From c57d577e8dc41f270a3ce0d604f5d8ac51b08ed7 Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Tue, 3 Jun 2025 03:38:23 +0800 Subject: [PATCH 001/115] add an absolute path for run.sh (#18258) Signed-off-by: calvin chen <120380290@qq.com> --- .../disaggregated-prefill-v1/run.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh index 0ebf45a1586a0..c1dcc95a2bd0b 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -1,5 +1,11 @@ rm -rf local_storage/ -rm output.txt -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py +if [ -f "output.txt" ]; then + rm output.txt +fi + +# The directory of current script +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py" +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py" From 9112b443a042d8d815880b8780633882ad32b183 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Mon, 2 Jun 2025 17:06:20 -0700 Subject: [PATCH 002/115] [Hardware][TPU] Initial support of model parallelism with single worker using SPMD (#18011) Signed-off-by: Siyuan Liu Co-authored-by: Hossein Sarshar Co-authored-by: Chengji Yao --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 4 + examples/offline_inference/tpu.py | 29 ++- .../v1/tpu/test_spmd_model_weight_loading.py | 67 +++++++ tests/v1/tpu/test_tpu_qkv_linear.py | 89 +++++++++ vllm/config.py | 2 + vllm/distributed/tpu_distributed_utils.py | 177 ++++++++++++++++++ vllm/envs.py | 5 + vllm/model_executor/model_loader/tpu.py | 112 +++++++++++ vllm/model_executor/utils.py | 4 +- 
vllm/v1/worker/tpu_model_runner.py | 101 ++++++---- vllm/v1/worker/tpu_worker.py | 87 +++++---- 11 files changed, 605 insertions(+), 72 deletions(-) create mode 100644 tests/v1/tpu/test_spmd_model_weight_loading.py create mode 100644 tests/v1/tpu/test_tpu_qkv_linear.py create mode 100644 vllm/distributed/tpu_distributed_utils.py create mode 100644 vllm/model_executor/model_loader/tpu.py diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 6102431456210..3212b660ec356 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -155,6 +155,10 @@ run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 13 "test_lora.py" \ "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 14 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 15 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" # After all tests have been attempted, exit with the overall status. 
if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index e4a75b3f93803..f3c2859d44d17 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +import argparse +import os + from vllm import LLM, SamplingParams prompts = [ @@ -18,14 +21,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) def main(): + parser = argparse.ArgumentParser(description="TPU offline inference example") + parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode") + args = parser.parse_args() + + llm_args = { + "model": "Qwen/Qwen2-1.5B-Instruct", + "max_num_batched_tokens": 64, + "max_num_seqs": 4, + "max_model_len": 128, + } + if args.use_spmd: + os.environ["VLLM_XLA_USE_SPMD"] = "1" + # Can only hardcode the number of chips for now. + # calling xr.global_runtime_device_count() beforeing init SPMD env in + # torch_xla will mess up the distributed env. + llm_args["tensor_parallel_size"] = 8 + # Use Llama, for num_kv_heads = 8. + llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" + # Set `enforce_eager=True` to avoid ahead-of-time compilation. # In real workloads, `enforace_eager` should be `False`. 
- llm = LLM( - model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4, - max_model_len=128, - ) + llm = LLM(**llm_args) outputs = llm.generate(prompts, sampling_params) print("-" * 50) for output, answer in zip(outputs, answers): diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py new file mode 100644 index 0000000000000..d36edfc3fb618 --- /dev/null +++ b/tests/v1/tpu/test_spmd_model_weight_loading.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +import gc +import tempfile + +import numpy as np +import pytest +import torch_xla.distributed.spmd as xs +import torch_xla.runtime as xr + +from vllm.config import set_current_vllm_config +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.model_loader.tpu import TPUModelLoader + + +def _setup_environment(model): + engine_args = EngineArgs(model=model, ) + vllm_config = engine_args.create_engine_config() + with set_current_vllm_config(vllm_config): + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + 1, + 0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + backend="gloo") + # Under single worker mode, full model is init first and then + # partitioned using GSPMD. 
+ ensure_model_parallel_initialized(1, 1) + return vllm_config + + +MESH = None + + +def _get_spmd_mesh(): + global MESH + if MESH is None: + xr.use_spmd() + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + return MESH + + +@pytest.mark.parametrize("model", [ + "Qwen/Qwen2-1.5B-Instruct", + "meta-llama/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-70B-Instruct", +]) +def test_tpu_model_loader(model): + # Skip the 70B test if there are less than 8 chips + # TODO: Query using torch xla API, the query API is not working + # with SPMD now. However, This test is running under SPMD mode. + if '70B' in model and xr.global_runtime_device_count() < 8: + pytest.skip( + "Skipping 70B model if the TPU VM has less than 8 chips to \ + avoid OOM.") + + vllm_config = _setup_environment(model) + loader = TPUModelLoader(load_config=vllm_config.load_config) + mesh = _get_spmd_mesh() + model = loader.load_model(vllm_config, vllm_config.model_config, mesh) + del model + gc.collect() diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py new file mode 100644 index 0000000000000..b98570f01a7f2 --- /dev/null +++ b/tests/v1/tpu/test_tpu_qkv_linear.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +import tempfile + +import numpy as np +import pytest +import torch +import torch_xla.distributed.spmd as xs +import torch_xla.runtime as xr + +from vllm.config import set_current_vllm_config +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.distributed.tpu_distributed_utils import XlaQKVParallelLinear +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.layers.linear import QKVParallelLinear + + +@pytest.fixture(autouse=True) +def setup_environment(): + # This is a fake config used for init dist env. 
+ # QKVParallelLinear needs dist env to be initialized. + engine_args = EngineArgs( + model="Qwen/Qwen2-1.5B-Instruct", + max_model_len=64, + max_num_batched_tokens=64, + max_num_seqs=4, + ) + + vllm_config = engine_args.create_engine_config() + + with set_current_vllm_config(vllm_config): + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + 1, + 0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + backend="gloo") + ensure_model_parallel_initialized(1, 1) + yield + + +MESH = None + + +def _get_spmd_mesh(): + global MESH + if MESH is None: + xr.use_spmd() + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + return MESH + + +@pytest.mark.parametrize("bias", [False, True]) +# `xr.use_spmd()` will set a global state, and this state is not reversible. +# Therefore, non-SPMD tests should be run before SPMD tests. +@pytest.mark.parametrize("mesh", [None, _get_spmd_mesh()]) +@pytest.mark.parametrize("device", ['cpu', 'xla']) +@torch.no_grad() +def test_xla_qkv_linear(bias, mesh, device): + torch.manual_seed(123) + + qkv_linear = QKVParallelLinear( + hidden_size=4096, + head_size=128, + total_num_heads=32, + total_num_kv_heads=8, + bias=bias, + params_dtype=torch.bfloat16, + return_bias=False, + ) + + qkv_linear.weight.data = torch.rand_like(qkv_linear.weight.data) / 10 + if bias: + qkv_linear.bias.data = torch.rand_like(qkv_linear.bias.data) + + xla_qkv_linear = XlaQKVParallelLinear(qkv_linear, mesh=mesh) + + qkv_linear = qkv_linear.to(device) + xla_qkv_linear = xla_qkv_linear.to(device) + input_tensor = torch.rand(10, 4096, dtype=torch.bfloat16) / 10 + input_tensor = input_tensor.to(device) + + output = qkv_linear(input_tensor) + xla_output = xla_qkv_linear(input_tensor) + assert torch.allclose(output.cpu(), xla_output.cpu()) diff --git a/vllm/config.py b/vllm/config.py index d0891d670b76d..1bd53e35b0532 100644 
--- a/vllm/config.py +++ b/vllm/config.py @@ -1901,6 +1901,8 @@ class ParallelConfig: if current_platform.is_neuron(): # neuron uses single process to control multiple devices backend = "uni" + elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + backend = "uni" elif (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): if not ray_found: diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py new file mode 100644 index 0000000000000..36ab2eb3a62f6 --- /dev/null +++ b/vllm/distributed/tpu_distributed_utils.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +from collections import OrderedDict +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_xla.distributed.spmd as xs +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) + +logger = init_logger(__name__) + + +class XlaQKVParallelLinear(nn.Module): + + def __init__(self, + qkv_linear: nn.Module, + mesh: Optional["xs.Mesh"] = None): + super().__init__() + assert isinstance(qkv_linear, QKVParallelLinear) + self.skip_bias_add = qkv_linear.skip_bias_add + self.return_bias = qkv_linear.return_bias + assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD." 
+ + self.q_weight: Parameter + self.k_weight: Parameter + self.v_weight: Parameter + self.q_bias: Optional[Parameter] + self.k_bias: Optional[Parameter] + self.v_bias: Optional[Parameter] + self._load_weights_from_qkv_linear(qkv_linear) + if mesh is not None: + self._shard_weight(mesh) + + def _shard_weight(self, mesh: "xs.Mesh"): + self.q_weight = Parameter(self.q_weight.to('xla'), requires_grad=False) + self.k_weight = Parameter(self.k_weight.to('xla'), requires_grad=False) + self.v_weight = Parameter(self.v_weight.to('xla'), requires_grad=False) + xs.mark_sharding(self.q_weight, mesh, ('x', None)) + xs.mark_sharding(self.k_weight, mesh, ('x', None)) + xs.mark_sharding(self.v_weight, mesh, ('x', None)) + if self.q_bias is not None: + assert self.k_bias is not None and self.v_bias is not None, \ + "QKVParallelLinear should have q, k, and v biases together." + self.q_bias = Parameter(self.q_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.q_bias, mesh, ('x', )) + self.k_bias = Parameter(self.k_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.k_bias, mesh, ('x', )) + self.v_bias = Parameter(self.v_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.v_bias, mesh, ('x', )) + + def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module): + q_proj_size, k_proj_size, _ = qkv_linear.output_sizes + # The weight of qkv linear is a concatenation of q, k, and v weights + # along the output dimension. 
+ qkv_weight = qkv_linear.weight.data.cpu() + q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False) + k_weight = Parameter(qkv_weight[q_proj_size:q_proj_size + k_proj_size], + requires_grad=False) + v_weight = Parameter(qkv_weight[q_proj_size + k_proj_size:], + requires_grad=False) + self.register_parameter("q_weight", q_weight) + self.register_parameter("k_weight", k_weight) + self.register_parameter("v_weight", v_weight) + + if qkv_linear.bias is not None: + q_bias = Parameter(qkv_linear.bias[:q_proj_size], + requires_grad=False) + k_bias = Parameter(qkv_linear.bias[q_proj_size:q_proj_size + + k_proj_size], + requires_grad=False) + v_bias = Parameter(qkv_linear.bias[q_proj_size + k_proj_size:], + requires_grad=False) + self.register_parameter("q_bias", q_bias) + self.register_parameter("k_bias", k_bias) + self.register_parameter("v_bias", v_bias) + else: + self.register_parameter("q_bias", None) + self.register_parameter("k_bias", None) + self.register_parameter("v_bias", None) + + def forward(self, input): + # Same forward functionality as QKVParallelLinear, but doing qkv porj + # separately. + q_bias = self.q_bias if not self.skip_bias_add else None + k_bias = self.k_bias if not self.skip_bias_add else None + v_bias = self.v_bias if not self.skip_bias_add else None + q_proj = F.linear(input, self.q_weight, q_bias) + k_proj = F.linear(input, self.k_weight, k_bias) + v_proj = F.linear(input, self.v_weight, v_bias) + # The q/k/v projections will be split outside of the QKVParallelLinear. + # Because we are replacing XlaQKVParallelLinear with the + # QKVParallelLinear, we need to concatenate q, k, and v projections to + # match the output shape of the QKVParallelLinear implementation even if + # it seems to be redundant. + # The concat and the following split will be noop, and should be + # optimized away by the compiler. 
+ qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1) + output_bias = torch.cat([q_bias, k_bias, v_bias], dim=-1) if \ + self.skip_bias_add else None + if not self.return_bias: + return qkv_proj + return qkv_proj, output_bias + + +def partition_column_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, ColumnParallelLinear) + xs.mark_sharding(layer.weight, mesh, ('x', None)) + logger.debug("Applied column-parallel sharding to %s", layer) + return layer + + +def partition_row_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, RowParallelLinear) + xs.mark_sharding(layer.weight, mesh, (None, 'x')) + logger.debug("Applied row-parallel sharding to %s", layer) + return layer + + +def partition_qkv_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, QKVParallelLinear) + xla_layer = XlaQKVParallelLinear(layer, mesh) + logger.debug("Applied qkv parallel sharding to %s", layer) + return xla_layer + + +MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([ + ("QKVParallelLinear", partition_qkv_parallel_linear), + ("ColumnParallelLinear", partition_column_parallel_linear), + ("RowParallelLinear", partition_row_parallel_linear), +]) + + +def get_fqn(module): + # Get the fully qualified name of the module + return module.__class__.__qualname__ + + +def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None: + """ + Recursively check a PyTorch model and apply appropriate sharding based on + the MODULE_TYPE_TO_WRAPPING_FUNC mapping. 
+ + Args: + model: torch.nn.Module to process + mesh: An XLA SPMD mesh object used for sharding + """ + + def _process_module(module, name=None, parent=None): + for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items(): + if get_fqn(module) == module_type: + wrapped_module = wrapping_func(module, mesh) + + assert parent is not None and name is not None, ( + "Top Level module is not expected to be wrapped.") + if wrapped_module is not module: + # Wrapped module and module are different py object. + # The original module should be replaced by the + # wrapped_module. + logger.debug("replace %s with %s", module, wrapped_module) + setattr(parent, name, wrapped_module) + + module = wrapped_module + break + + for child_name, child_module in list(module.named_children()): + _process_module(child_module, child_name, module) + + _process_module(model) diff --git a/vllm/envs.py b/vllm/envs.py index 44baf5a189b43..3dd0d9045372f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -51,6 +51,7 @@ if TYPE_CHECKING: VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_XLA_USE_SPMD: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -513,6 +514,10 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, assert on XLA recompilation after each execution step. "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), + + # Enable SPMD mode for TPU backend. 
+ "VLLM_XLA_USE_SPMD": + lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py new file mode 100644 index 0000000000000..6197bcdba826b --- /dev/null +++ b/vllm/model_executor/model_loader/tpu.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +import time +from typing import Optional + +import torch +import torch.nn as nn +import torch_xla.core.xla_model as xm +import torch_xla.distributed.spmd as xs + +from vllm.config import ModelConfig, VllmConfig +from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model +from vllm.logger import init_logger +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader +from vllm.model_executor.model_loader.utils import ( + initialize_model, process_weights_after_loading, set_default_torch_dtype) + +logger = init_logger(__name__) + + +class TPUModelLoader(DefaultModelLoader): + """ + A TPU model loader for model loading under SPMD mode. + """ + + def load_model( + self, + vllm_config: VllmConfig, + model_config: ModelConfig, + mesh: Optional[xs.Mesh] = None, + ) -> nn.Module: + # Initialize model and load weights on CPU. Then, during SPMD partition, + # weights are sharded and transferred to TPUs. 
+ self.counter_before_loading_weights = time.perf_counter() + model_config = vllm_config.model_config + assert model_config.quantization is None, "Quantization not supported" + target_device = torch.device('cpu') + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model(vllm_config=vllm_config) + + load_format = vllm_config.load_config.load_format + if load_format != "dummy": + weights_to_load = { + name + for name, _ in model.named_parameters() + } + all_weights = self.get_all_weights(model_config, model) + loaded_weights = model.load_weights(all_weights) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights - + self.counter_before_loading_weights) + # We only enable strict check for non-quantized models + # that have loaded weights tracking currently. + if model_config.quantization is None and \ + loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError( + "Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") + else: + logger.info("Use dummy weight during weight loading.") + + process_weights_after_loading(model, model_config, target_device) + + counter_before_partition = time.perf_counter() + model = model.eval() + model = model.to('xla') + shard_model(model, mesh) + counter_after_partition = time.perf_counter() + logger.info("Partition model took %.2f seconds", + counter_after_partition - counter_before_partition) + + # Ensure the model is properly loaded. + self._check_model_is_loaded(mesh, model) + + # Need to torch compile after model sharding are done. Because the + # compiler hints ('xs.mark_sharding') are torch ops. 
+ if not model_config.is_multimodal_model: + model.model = torch.compile(model.model, backend="openxla") + else: + model.language_model.model = \ + torch.compile(model.language_model.model, backend="openxla") + return model + + def _check_model_is_loaded(self, mesh: Optional[xs.Mesh], + model: nn.Module) -> None: + """ + Ensure the model is properly loaded. + 1. All model parameters and buffers are on XLA device. + 2. Non-SPMD friendly layers are replaced as expected. + """ + device = xm.xla_device() + device_type = str(device.type) + + # Check parameters + for name, param in model.named_parameters(): + assert param.device.type == device_type, f"Parameter {name} is on \ + {param.device.type} instead of {device_type}" + + # Check buffers + for name, buffer in model.named_buffers(): + assert buffer.device.type == device_type, \ + f"Buffer {name} is on {buffer.device.type} instead of \ + {device_type}" + + for module in model.modules(): + if (mesh is not None) and (get_fqn(module) == 'QKVParallelLinear'): + raise AssertionError("QKVParallelLinear should be replaced by \ + XlaQKVParallelLinear under SPMD mode.") diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 1b120c3545a56..27cea65217875 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -49,7 +49,9 @@ def _make_synced_weight_loader(original_weight_loader): def _synced_weight_loader(param, *args, **kwargs): original_weight_loader(param, *args, **kwargs) - torch._sync(param) + # torch._sync doesn't support, is not needed for CPU tensors. 
+ if param.device != torch.device("cpu"): + torch._sync(param) return _synced_weight_loader diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5de92351e24ba..c5171b9736b36 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -7,21 +7,22 @@ from unittest.mock import patch import numpy as np import torch -import torch.distributed import torch.nn as nn # TPU XLA related import torch_xla.core.xla_model as xm +import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr import vllm.envs as envs from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.config import ParallelConfig, VllmConfig, get_layers_from_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA from vllm.model_executor.model_loader import get_model_loader +from vllm.model_executor.model_loader.tpu import TPUModelLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) @@ -98,6 +99,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): self, vllm_config: VllmConfig, device: torch.device, + original_parallel_config: Optional[ParallelConfig] = None, ): self.vllm_config = vllm_config self.model_config = vllm_config.model_config @@ -105,6 +107,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.lora_config = vllm_config.lora_config self.load_config = vllm_config.load_config self.parallel_config = vllm_config.parallel_config + self.original_parallel_config = original_parallel_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config self.prompt_adapter_config = 
vllm_config.prompt_adapter_config @@ -118,6 +121,14 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.device = device self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION + # SPMD Related + self.use_spmd = envs.VLLM_XLA_USE_SPMD + if self.use_spmd: + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + self.mesh = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + self.enforce_eager = model_config.enforce_eager self.num_xla_graphs = 0 @@ -271,6 +282,15 @@ class TPUModelRunner(LoRAModelRunnerMixin): max_num_mm_items_decoder_budget) self.max_num_mm_items_by_modality[modality] = max_num_mm_items + if not self.use_spmd: + self.sample_from_logits_func = torch.compile( + self.sample_from_logits, + backend="openxla", + fullgraph=True, + dynamic=False) + else: + self.sample_from_logits_func = self.sample_from_logits + def _update_num_xla_graphs(self, case_str): check_comp = self.check_recompilation and not self.enforce_eager if not check_comp: @@ -825,9 +845,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): logits = self.structured_decode(require_struct_decoding, grammar_bitmask_padded, logits, arange) - selected_token_ids = self.sample_from_logits(logits, - tpu_sampling_metadata) - + selected_token_ids = self.sample_from_logits_func( + logits, tpu_sampling_metadata) # NOTE (NickLucche) Use the original logits (before any penalties or # temperature scaling) for the top-k logprobs. We can't enforce it due # to recompilations outside torch.compiled code, so just make sure @@ -935,18 +954,26 @@ class TPUModelRunner(LoRAModelRunnerMixin): "vllm.model_executor.layers.vocab_parallel_embedding." 
"get_tensor_model_parallel_rank", return_value=xm_tp_rank): - # model = get_model(vllm_config=self.vllm_config) - model_loader = get_model_loader(self.load_config) - if not hasattr(self, "model"): - logger.info("Loading model from scratch...") - model = model_loader.load_model(vllm_config=self.vllm_config, - model_config=self.model_config) + if self.use_spmd: + tpu_loader = TPUModelLoader( + load_config=self.vllm_config.load_config) + model = tpu_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.vllm_config.model_config, + mesh=self.mesh) else: - logger.info( - "Model was already initialized. Loading weights inplace..." - ) - model_loader.load_weights(self.model, - model_config=self.model_config) + # model = get_model(vllm_config=self.vllm_config) + model_loader = get_model_loader(self.load_config) + if not hasattr(self, "model"): + logger.info("Loading model from scratch...") + model = model_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info("Model was already initialized. 
\ + Loading weights inplace...") + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config is not None: model = self.load_lora_model(model, self.model_config, self.scheduler_config, @@ -970,31 +997,25 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device) else: input_ids = torch.zeros((num_tokens), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) inputs_embeds = None actual_num_reqs = min(num_tokens, self.max_num_reqs) position_ids = torch.zeros(num_tokens, - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) slot_mapping = torch.zeros(num_tokens, - dtype=torch.int64, - device=self.device) + dtype=torch.int64).to(self.device) block_tables = torch.zeros( (self.max_num_reqs, self.block_table_cpu.shape[1]), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) query_lens = [1] * self.max_num_reqs query_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0, dtype=torch.int32).to(self.device) context_lens = torch.ones((self.max_num_reqs, ), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) num_seqs = torch.tensor([actual_num_reqs], - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) attn_metadata = PallasMetadata( slot_mapping=slot_mapping, block_tables=block_tables, @@ -1198,7 +1219,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): with self.maybe_select_dummy_loras( self.lora_config, np.array([num_reqs], dtype=np.int32)): - self.sample_from_logits(dummy_logits, sampling_metadata) + self.sample_from_logits_func(dummy_logits, + sampling_metadata) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1332,14 +1354,22 @@ class TPUModelRunner(LoRAModelRunnerMixin): assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes if isinstance(kv_cache_spec, 
AttentionSpec): + if self.use_spmd: + num_kv_heads = kv_cache_spec.num_kv_heads + assert self.original_parallel_config is not None + tp_size = \ + self.original_parallel_config.tensor_parallel_size + # TODO: Handle kv cache duplication under SPMD mode. + assert num_kv_heads % tp_size == 0, ( + f"num_kv_heads {num_kv_heads} must be divisible by " + f"tp_size {tp_size} under SPMD mode") kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype tpu_kv_cache = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) + dtype=dtype).to(self.device) kv_caches[layer_name] = tpu_kv_cache else: @@ -1350,6 +1380,11 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.vllm_config.compilation_config.static_forward_context, self.kv_caches) + if self.use_spmd: + # Shard KV Cache + for cache in self.kv_caches: + xs.mark_sharding(cache, self.mesh, (None, 'x', None, None)) + def reset_dynamo_cache(self): if self.is_multimodal_model: compiled_model = self.model.get_language_model().model @@ -1370,7 +1405,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): sample_hidden_states: torch.Tensor) -> torch.Tensor: return self.model.compute_logits(sample_hidden_states, None) - @torch.compile(backend="openxla", fullgraph=True, dynamic=False) + # TODO: Under SPMD mode, sample_from_logits has correctness issue. + # Re-enable the torch.compile once the issue is fixed in torchxla. 
+ # @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def sample_from_logits( self, logits: torch.Tensor, sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor: diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 0707e17afe7a7..bf0a5777cb3ff 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -45,6 +45,15 @@ class TPUWorker: self.lora_config = vllm_config.lora_config self.load_config = vllm_config.load_config self.parallel_config = vllm_config.parallel_config + self.use_spmd = envs.VLLM_XLA_USE_SPMD + self.original_parallel_config = None + if self.use_spmd: + # Under SPMD mode, distributed env is initialized as if there is + # only one worker/device. + self.original_parallel_config = self.parallel_config + self.parallel_config.tensor_parallel_size = 1 + self.parallel_config.pipeline_parallel_size = 1 + self.parallel_config.world_size = 1 self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config @@ -95,10 +104,9 @@ class TPUWorker: torch.set_default_dtype(self.model_config.dtype) # Initialize the distributed environment. - init_tpu_worker_distributed_environment(self.parallel_config, - self.rank, - self.distributed_init_method, - self.local_rank) + self._init_tpu_worker_distributed_environment( + self.parallel_config, self.rank, self.distributed_init_method, + self.local_rank) # Device initialization should happen after initializing # the distributed runtime. @@ -132,7 +140,9 @@ class TPUWorker: xr.initialize_cache(per_rank_path, readonly=False) # Init ModelRunner here, so that we have access to self.device. - self.model_runner = TPUModelRunner(self.vllm_config, self.device) + self.model_runner = \ + TPUModelRunner(self.vllm_config, self.device, + self.original_parallel_config) if rank == 0: # If usage stat is enabled, collect relevant info. 
@@ -147,9 +157,7 @@ class TPUWorker: # Use an empty tensor instead of `None`` to force Dynamo to pass # it by reference, rather by specializing on the value ``None``. - tpu_kv_cache = torch.tensor([], - dtype=dtype, - device=self.device) + tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device) kv_caches[layer_name] = tpu_kv_cache else: raise NotImplementedError( @@ -178,9 +186,20 @@ class TPUWorker: # Get the maximum amount of memory used by the model weights and # intermediate activations. - m = xm.get_memory_info(self.device) - total_memory_size = m["bytes_limit"] - current_mem = m["bytes_used"] + if self.use_spmd: + # This is a workaround for the TPU SPMD mode. The get_memory_info + # API doesn't work with SPMD mode in PyTorch/XLA. + # TODO: use xm.get_memory_info for SPMD once it's supported in + # PyTorch/XLA. + import tpu_info + chip_type, _ = tpu_info.device.get_local_chips() + device_usage = tpu_info.metrics.get_chip_usage(chip_type) + total_memory_size = device_usage[0].total_memory + current_mem = device_usage[0].memory_usage + else: + m = xm.get_memory_info(self.device) + total_memory_size = m["bytes_limit"] + current_mem = m["bytes_used"] # Ideally we would use profiled = m["peak_bytes_used"] to # get weights + activations. But there is memory used during # compilation / weight loading that impacts the peak and @@ -241,28 +260,30 @@ class TPUWorker: # worker will always be healthy as long as it's running. return - -def init_tpu_worker_distributed_environment( - parallel_config: ParallelConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - - # NOTE(woosuk): This is just to initialize the TP group and broadcast - # the input objects on CPU. The all-reduce and all-gather ops on TPU - # are invoked by `xm.all_reduce` and `xm.all_gather` which use their - # own context. 
- init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - local_rank=local_rank, - distributed_init_method=distributed_init_method, - backend="gloo", - ) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + def _init_tpu_worker_distributed_environment( + self, + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, + ) -> None: + """Initialize the distributed environment.""" + if self.use_spmd: + xr.use_spmd() + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + local_rank=local_rank, + distributed_init_method=distributed_init_method, + backend="gloo", + ) + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) try: From 5bc1ad6cee754405464a9957e86cf3a9302e4986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Tue, 3 Jun 2025 11:49:48 +0900 Subject: [PATCH 003/115] [Doc] Remove duplicate TOCs during MkDocs migration (#19021) Signed-off-by: Zerohertz --- docs/cli/README.md | 13 ------------- docs/deployment/nginx.md | 10 ---------- 2 files changed, 23 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index 5feb316d61a89..f43ce766390ad 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -12,19 +12,6 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` -## Table of Contents - -- [serve](#serve) -- [chat](#chat) -- [complete](#complete) -- [bench](#bench) - - [latency](#latency) - - [serve](#serve-1) - - [throughput](#throughput) -- [collect-env](#collect-env) -- [run-batch](#run-batch) -- 
[More Help](#more-help) - ## serve Start the vLLM OpenAI Compatible API server. diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index 80242919ba5b3..f0ff5c1d0e76d 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -5,16 +5,6 @@ title: Using Nginx This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. -Table of contents: - -1. [Build Nginx Container][nginxloadbalancer-nginx-build] -2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf] -3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container] -4. [Create Docker Network][nginxloadbalancer-nginx-docker-network] -5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container] -6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx] -7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx] - [](){ #nginxloadbalancer-nginx-build } ## Build Nginx Container From 8a57872b2ac9b01004ae1d3a3a689de218ea5be5 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 2 Jun 2025 23:36:51 -0400 Subject: [PATCH 004/115] [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034) Signed-off-by: Tyler Michael Smith Signed-off-by: Tyler Michael Smith --- vllm/distributed/device_communicators/all2all.py | 4 ++++ vllm/model_executor/layers/fused_moe/layer.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index a250ec89cd5ba..7177754a37115 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -83,6 +83,10 @@ class PPLXAll2AllManager(All2AllManagerBase): assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." 
# noqa super().__init__(cpu_group) + # TODO(tms): Disable pplx-a2a intranode as it fails with the error: + # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa + self.internode = True + if self.internode: # inter-node communication needs nvshmem, # intra-node communication uses p2p mapping directly diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index af7b98e14c6c8..1e193c909f617 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -269,9 +269,13 @@ class FusedMoEMethodBase(QuantizeMethodBase): hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else ( (moe.hidden_dim + moe.block_size - 1) // moe.block_size * torch.float32.itemsize)), - group_name=all2all_manager.cpu_group.group_name, ) + # Intranode pplx a2a takes a group name while internode does not. + if not all2all_manager.internode: + all_to_all_args[ + "group_name"] = all2all_manager.cpu_group.group_name + handle = all2all_manager.get_handle(all_to_all_args) prepare_finalize = PplxPrepareAndFinalize( From 4ce42f92042ef8a24e925fc7121f7c98e51f73ba Mon Sep 17 00:00:00 2001 From: Concurrensee Date: Mon, 2 Jun 2025 22:46:44 -0500 Subject: [PATCH 005/115] Adding "LoRA Test %N" to AMD production tests (#18929) Signed-off-by: Yida Wu --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 4 ++++ .buildkite/test-pipeline.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index bbc896ec68190..6e9af1e721bb7 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s 
compile/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s lora"* ]]; then + commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} +fi + #ignore certain kernels tests if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff2f69c17ba7..5fb8ceaace05d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -287,7 +287,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/lora - tests/lora From 8655f47f37750eb5d00992d39305d6705659983f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 3 Jun 2025 11:46:47 +0800 Subject: [PATCH 006/115] [CPU][CI] Re-enable the CPU CI tests (#19046) Signed-off-by: jiang.li --- .../scripts/hardware_ci/run-cpu-test.sh | 42 +++++++++---------- docker/Dockerfile.cpu | 10 +++-- vllm/distributed/parallel_state.py | 3 +- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 40f3df96065d1..0a11935607e2a 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -8,67 +8,65 @@ set -ex CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} +export CMAKE_BUILD_PARALLEL_LEVEL=32 + # Setup cleanup remove_docker_container() { set -e; - docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; - docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C 
"$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 - export BUILDKITE_BUILD_NUMBER=$3 # offline inference - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test - docker exec 
cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/test_cache.py -m cpu_model - pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model - pytest -v -s tests/models/decoder_only/language -m cpu_model - pytest -v -s tests/models/embedding/language -m cpu_model - pytest -v -s tests/models/encoder_decoder/language -m cpu_model - pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + pytest -v -s tests/models/language/generation -m cpu_model + pytest -v -s tests/models/language/pooling -m cpu_model + pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" # online serving - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 @@ -83,7 +81,7 @@ function cpu_tests() { 
--tokenizer facebook/opt-125m" # Run multi-lora tests - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/lora/test_qwen2vl.py" @@ -91,4 +89,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 5395b3884fb52..6db2f307a3800 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \ --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel @@ -85,7 +86,7 @@ WORKDIR /workspace/vllm RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt-get install -y --no-install-recommends vim numactl + apt-get install -y --no-install-recommends vim numactl xz-utils # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ @@ -108,8 +109,11 @@ FROM base AS vllm-test WORKDIR /workspace/ RUN --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \ - uv pip install -r requirements/test.txt + --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ + cp requirements/test.in requirements/test-cpu.in && \ + sed -i '/mamba_ssm/d' requirements/test-cpu.in && \ + uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \ + uv pip install -r requirements/cpu-test.txt RUN --mount=type=cache,target=/root/.cache/uv \ 
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6e48c02da6692..32c9301bf23d3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1203,7 +1203,8 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): if empty_cache is not None: empty_cache() try: - torch._C._host_emptyCache() + if not current_platform.is_cpu(): + torch._C._host_emptyCache() except AttributeError: logger.warning( "torch._C._host_emptyCache() only available in Pytorch >=2.5") From 9e6f61e8c3df833537e4bea6c33f85eca5d73b15 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:47:47 -0400 Subject: [PATCH 007/115] [ROCm][Build] Clean up the ROCm build (#19040) Signed-off-by: Gregory Shtrasberg --- CMakeLists.txt | 4 ---- docker/Dockerfile.rocm | 17 ----------------- .../installation/gpu/rocm.inc.md | 2 -- requirements/rocm.txt | 2 ++ 4 files changed, 2 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6536e9a57f6e7..87aa23c080f50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,9 +182,6 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") -# -# Set rocm version dev int. -# if(VLLM_GPU_LANG STREQUAL "HIP") # # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info @@ -192,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP") set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") - # # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates # a lot of warnings that always mask real issues. Suppressing until this is properly addressed. 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index e60cf5e69a4c4..b186f88d27443 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,7 +1,5 @@ # default base image ARG REMOTE_VLLM="0" -ARG USE_CYTHON="0" -ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app ARG BASE_IMAGE=rocm/vllm-dev:base @@ -36,12 +34,10 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -ARG USE_CYTHON # Build vLLM RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ - && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR @@ -90,13 +86,6 @@ RUN case "$(which python3)" in \ *) ;; esac RUN python3 -m pip install --upgrade huggingface-hub[cli] -ARG BUILD_RPD -RUN if [ ${BUILD_RPD} -eq "1" ]; then \ - git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ - && cd rocmProfileData/rpd_tracer \ - && pip install -r requirements.txt && cd ../ \ - && make && make install \ - && cd hipMarker && python3 setup.py install ; fi # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ @@ -117,12 +106,6 @@ ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time ENV SAFETENSORS_FAST_GPU=1 -# User-friendly environment setting for multi-processing to avoid below RuntimeError. -# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, -# you must use the 'spawn' start method -# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn - # Performance environment variable. 
ENV HIP_FORCE_DEV_KERNARG=1 diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 0029b3a244968..8b7dc6dd09d34 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -179,8 +179,6 @@ It is important that the user kicks off the docker build using buildkit. Either It provides flexibility to customize the build of docker image using the following arguments: - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using -- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build -- `BUILD_RPD`: Include RocmProfileData profiling tool in the image - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image Their values can be passed in when running `docker build` with `--build-arg` options. 
diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 8a84f2ff1ed01..fb1febdac5067 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -12,5 +12,7 @@ ray>=2.10.0,<2.45.0 peft pytest-asyncio tensorizer>=2.9.0 +setuptools-scm>=8 +setuptools>=77.0.3,<80.0.0 runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 From bdce64f2365b39335141f8efcb3a0a8ecc559153 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Mon, 2 Jun 2025 21:15:13 -0700 Subject: [PATCH 008/115] [V1] Support DP with Ray (#18779) --- requirements/test.in | 2 +- requirements/test.txt | 50 +++++++ tests/v1/test_async_llm_dp.py | 13 +- vllm/config.py | 6 + vllm/engine/arg_utils.py | 29 +++- vllm/entrypoints/cli/serve.py | 35 ++++- vllm/v1/engine/async_llm.py | 13 +- vllm/v1/engine/core.py | 180 ++++++++++++++++------- vllm/v1/engine/core_client.py | 74 ++++++++-- vllm/v1/utils.py | 269 ++++++++++++++++++++++++++++------ 10 files changed, 551 insertions(+), 120 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index e906752ff875b..9b574a09fcce5 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft pqdm -ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.txt b/requirements/test.txt index 60dcaca816a2b..03aec80ac1283 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -10,9 +10,13 @@ aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.11 # via + # aiohttp-cors # datasets # fsspec # lm-eval + # ray +aiohttp-cors==0.8.1 + # via ray aiosignal==1.3.1 # via # aiohttp @@ -57,6 +61,8 @@ 
bounded-pool-executor==0.0.3 # via pqdm buildkite-test-collector==0.1.9 # via -r requirements/test.in +cachetools==5.5.2 + # via google-auth certifi==2024.8.30 # via # httpcore @@ -81,6 +87,8 @@ colorama==0.4.6 # sacrebleu # schemathesis # tqdm-multiprocess +colorful==0.5.6 + # via ray contourpy==1.3.0 # via matplotlib cramjam==2.9.0 @@ -108,6 +116,8 @@ dill==0.3.8 # evaluate # lm-eval # multiprocess +distlib==0.3.9 + # via virtualenv dnspython==2.7.0 # via email-validator docopt==0.6.2 @@ -143,6 +153,7 @@ filelock==3.16.1 # ray # torch # transformers + # virtualenv fonttools==4.54.1 # via matplotlib fqdn==1.5.1 @@ -165,8 +176,16 @@ genai-perf==0.0.8 # via -r requirements/test.in genson==1.3.0 # via datamodel-code-generator +google-api-core==2.24.2 + # via opencensus +google-auth==2.40.2 + # via google-api-core +googleapis-common-protos==1.70.0 + # via google-api-core graphql-core==3.2.6 # via hypothesis-graphql +grpcio==1.71.0 + # via ray h11==0.14.0 # via httpcore harfile==0.3.0 @@ -392,6 +411,10 @@ nvidia-nvjitlink-cu12==12.8.61 # torch nvidia-nvtx-cu12==12.8.55 # via torch +opencensus==0.11.4 + # via ray +opencensus-context==0.1.3 + # via opencensus opencv-python-headless==4.11.0.86 # via # -r requirements/test.in @@ -445,6 +468,7 @@ platformdirs==4.3.6 # via # black # pooch + # virtualenv plotly==5.24.1 # via genai-perf pluggy==1.5.0 @@ -457,10 +481,17 @@ portalocker==2.10.1 # via sacrebleu pqdm==0.2.0 # via -r requirements/test.in +prometheus-client==0.22.0 + # via ray propcache==0.2.0 # via yarl +proto-plus==1.26.1 + # via google-api-core protobuf==5.28.3 # via + # google-api-core + # googleapis-common-protos + # proto-plus # ray # tensorizer psutil==6.1.0 @@ -470,10 +501,18 @@ psutil==6.1.0 # tensorizer py==1.11.0 # via pytest-forked +py-spy==0.4.0 + # via ray pyarrow==18.0.0 # via # datasets # genai-perf +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth pybind11==2.13.6 # via lm-eval pycparser==2.22 @@ -486,6 
+525,7 @@ pydantic==2.11.5 # datamodel-code-generator # mistral-common # mteb + # ray pydantic-core==2.33.2 # via pydantic pygments==2.18.0 @@ -573,6 +613,7 @@ requests==2.32.3 # buildkite-test-collector # datasets # evaluate + # google-api-core # huggingface-hub # lm-eval # mistral-common @@ -601,6 +642,8 @@ rpds-py==0.20.1 # via # jsonschema # referencing +rsa==4.9.1 + # via google-auth runai-model-streamer==0.11.0 # via -r requirements/test.in runai-model-streamer-s3==0.11.0 @@ -648,9 +691,12 @@ shellingham==1.5.4 six==1.16.0 # via # junit-xml + # opencensus # python-dateutil # rfc3339-validator # rouge-score +smart-open==7.1.0 + # via ray sniffio==1.3.1 # via # anyio @@ -801,6 +847,8 @@ urllib3==2.2.3 # tritonclient vector-quantize-pytorch==1.21.2 # via -r requirements/test.in +virtualenv==20.31.2 + # via ray vocos==0.1.0 # via -r requirements/test.in webcolors==24.11.1 @@ -809,6 +857,8 @@ werkzeug==3.1.3 # via schemathesis word2number==1.1 # via lm-eval +wrapt==1.17.2 + # via smart-open xxhash==3.5.0 # via # datasets diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index ce4c4d198db58..366fa3b2561fd 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -59,14 +59,22 @@ async def generate(engine: AsyncLLM, @pytest.mark.parametrize( - "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) + "output_kind", + [ + RequestOutputKind.DELTA, + RequestOutputKind.FINAL_ONLY, + ], +) +@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"]) @pytest.mark.asyncio -async def test_load(output_kind: RequestOutputKind): +async def test_load(output_kind: RequestOutputKind, + data_parallel_backend: str): with ExitStack() as after: prompt = "This is a test of data parallel" + engine_args.data_parallel_backend = data_parallel_backend engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -82,7 +90,6 @@ async def test_load(output_kind: RequestOutputKind): asyncio.create_task( 
generate(engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS))) - # Confirm that we got all the EXPECTED tokens from the requests. done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION) diff --git a/vllm/config.py b/vllm/config.py index 1bd53e35b0532..8aa1b56103004 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1742,6 +1742,8 @@ class ParallelConfig: """Port for data parallel messaging.""" data_parallel_master_port: int = 29500 """Port of the data parallel master.""" + data_parallel_backend: str = "mp" + """Backend to use for data parallel, either "mp" or "ray".""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" max_parallel_loading_workers: Optional[int] = None @@ -1911,6 +1913,10 @@ class ParallelConfig: "please install Ray with `pip install " "ray`.") from ray_utils.ray_import_err backend = "ray" + elif self.data_parallel_backend == "ray": + logger.info("Using ray distributed inference because " + "data_parallel_backend is ray") + backend = "ray" elif ray_found: if self.placement_group: backend = "ray" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 299c8347f458a..a5b155024b73a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, is_in_ray_actor) + GiB_bytes, get_ip, is_in_ray_actor) # yapf: enable @@ -292,6 +292,7 @@ class EngineArgs: data_parallel_size_local: Optional[int] = None data_parallel_address: Optional[str] = None data_parallel_rpc_port: Optional[int] = None + data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel max_parallel_loading_workers: Optional[ int] 
= ParallelConfig.max_parallel_loading_workers @@ -624,6 +625,12 @@ class EngineArgs: type=int, help='Port for data parallel RPC ' 'communication.') + parallel_group.add_argument('--data-parallel-backend', + '-dpb', + type=str, + default='mp', + help='Backend for data parallel, either ' + '"mp" or "ray".') parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]) @@ -1059,9 +1066,20 @@ class EngineArgs: # DP address, used in multi-node case for torch distributed group # and ZMQ sockets. - data_parallel_address = self.data_parallel_address if ( - self.data_parallel_address - is not None) else ParallelConfig.data_parallel_master_ip + if self.data_parallel_address is None: + if self.data_parallel_backend == "ray": + host_ip = get_ip() + logger.info( + "Using host IP %s as ray-based data parallel address", + host_ip) + data_parallel_address = host_ip + else: + assert self.data_parallel_backend == "mp", ( + "data_parallel_backend can only be ray or mp, got %s", + self.data_parallel_backend) + data_parallel_address = ParallelConfig.data_parallel_master_ip + else: + data_parallel_address = self.data_parallel_address # This port is only used when there are remote data parallel engines, # otherwise the local IPC transport is used. 
@@ -1069,6 +1087,8 @@ class EngineArgs: self.data_parallel_rpc_port is not None) else ParallelConfig.data_parallel_rpc_port + data_parallel_backend = self.data_parallel_backend + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1076,6 +1096,7 @@ class EngineArgs: data_parallel_size_local=data_parallel_size_local, data_parallel_master_ip=data_parallel_address, data_parallel_rpc_port=data_parallel_rpc_port, + data_parallel_backend=data_parallel_backend, enable_expert_parallel=self.enable_expert_parallel, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index e65c97073218b..040ae166a2d5f 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -27,7 +27,8 @@ from vllm.v1.engine.core_client import CoreEngineProcManager from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.utils import (APIServerProcessManager, CoreEngine, - EngineZmqAddresses, get_engine_client_zmq_addr, + CoreEngineActorManager, EngineZmqAddresses, + get_engine_client_zmq_addr, wait_for_completion_or_failure, wait_for_engine_startup) @@ -229,6 +230,31 @@ def run_multi_api_server(args: argparse.Namespace): logger.info("Started DP Coordinator process (PID: %d)", coordinator.proc.pid) + if parallel_config.data_parallel_backend == "ray": + logger.info("Starting ray-based data parallel backend") + + engine_actor_manager = CoreEngineActorManager( + vllm_config=vllm_config, + addresses=addresses, + executor_class=Executor.get_class(vllm_config), + log_stats=not engine_args.disable_log_stats, + ) + # Start API servers using the manager + api_server_manager = APIServerProcessManager( + target_server_fn=run_api_server_worker_proc, + listen_address=listen_address, + sock=sock, + 
args=args, + num_servers=num_api_servers, + input_addresses=input_addresses, + output_addresses=output_addresses, + stats_update_address=stats_update_address) + + wait_for_completion_or_failure(api_server_manager=api_server_manager, + engine_manager=engine_actor_manager, + coordinator=coordinator) + return + handshake_address = get_engine_client_zmq_addr( local_only, host, parallel_config.data_parallel_rpc_port) @@ -277,10 +303,9 @@ def run_multi_api_server(args: argparse.Namespace): ) # Wait for API servers - wait_for_completion_or_failure( - api_server_manager=api_server_manager, - local_engine_manager=local_engine_manager, - coordinator=coordinator) + wait_for_completion_or_failure(api_server_manager=api_server_manager, + engine_manager=local_engine_manager, + coordinator=coordinator) def run_api_server_worker_proc(listen_address, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 86781e7528fa3..4b235c596ed6d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,8 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient +from vllm.v1.engine.core_client import (AsyncMPClient, DPAsyncMPClient, + RayDPClient) from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.output_processor import (OutputProcessor, RequestOutputCollector) @@ -119,9 +120,13 @@ class AsyncLLM(EngineClient): log_stats=self.log_stats) # EngineCore (starts the engine in background process). 
- core_client_class = AsyncMPClient if ( - vllm_config.parallel_config.data_parallel_size - == 1) else DPAsyncMPClient + core_client_class: type[AsyncMPClient] + if vllm_config.parallel_config.data_parallel_size == 1: + core_client_class = AsyncMPClient + elif vllm_config.parallel_config.data_parallel_backend == "ray": + core_client_class = RayDPClient + else: + core_client_class = DPAsyncMPClient self.engine_core = core_client_class( vllm_config=vllm_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a02abb62b1f36..7253d1dc66d1f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -6,8 +6,9 @@ import sys import threading import time from collections import deque +from collections.abc import Generator from concurrent.futures import Future -from contextlib import ExitStack +from contextlib import ExitStack, contextmanager from inspect import isclass, signature from logging import DEBUG from typing import Any, Callable, Optional, TypeVar, Union @@ -367,42 +368,66 @@ class EngineCoreProc(EngineCore): log_stats: bool, engine_index: int = 0, ): - input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() - - executor_fail_callback = lambda: input_queue.put_nowait( + self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() + self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], + bytes]]() + executor_fail_callback = lambda: self.input_queue.put_nowait( (EngineCoreRequestType.EXECUTOR_FAILED, b'')) - # Create input socket. + self.engine_index = engine_index + identity = self.engine_index.to_bytes(length=2, byteorder="little") + self.engines_running = False + + with self._perform_handshake(handshake_address, identity, on_head_node, + vllm_config) as addresses: + self.client_count = len(addresses.outputs) + + # Set up data parallel environment. 
+ self.has_coordinator = addresses.coordinator_output is not None + self._init_data_parallel(vllm_config) + + super().__init__(vllm_config, executor_class, log_stats, + executor_fail_callback) + + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, addresses.coordinator_input, + identity), + daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + self.engine_index), + daemon=True) + self.output_thread.start() + + @contextmanager + def _perform_handshake( + self, handshake_address: str, identity: bytes, on_head_node: bool, + vllm_config: VllmConfig + ) -> Generator[EngineZmqAddresses, None, None]: input_ctx = zmq.Context() - identity = engine_index.to_bytes(length=2, byteorder="little") with make_zmq_socket(input_ctx, handshake_address, zmq.DEALER, identity=identity, linger=5000, bind=False) as handshake_socket: - # Register engine with front-end. addresses = self.startup_handshake(handshake_socket, on_head_node, vllm_config.parallel_config) - self.client_count = len(addresses.outputs) - # Update config which may have changed from the handshake. + # Update config which may have changed from the handshake vllm_config.__post_init__() - # Set up data parallel environment. - self.has_coordinator = addresses.coordinator_output is not None - self._init_data_parallel(vllm_config) - - # Initialize engine core and model. 
- super().__init__(vllm_config, executor_class, log_stats, - executor_fail_callback) - - self.engine_index = engine_index - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - self.engines_running = False - self.last_counts = (0, 0) + yield addresses # Send ready message. num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks @@ -413,25 +438,6 @@ class EngineCoreProc(EngineCore): "num_gpu_blocks": num_gpu_blocks, })) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue = input_queue - self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], - bytes]]() - threading.Thread(target=self.process_input_sockets, - args=(addresses.inputs, addresses.coordinator_input, - identity), - daemon=True).start() - self.output_thread = threading.Thread( - target=self.process_output_sockets, - args=(addresses.outputs, addresses.coordinator_output, - engine_index), - daemon=True) - self.output_thread.start() - @staticmethod def startup_handshake( handshake_socket: zmq.Socket, on_head_node: bool, @@ -743,6 +749,21 @@ class DPEngineCoreProc(EngineCoreProc): executor_class: type[Executor], log_stats: bool, ): + + self._decorate_logs() + + # Counts forward-passes of the model so that we can synchronize + # finished with DP peers every N steps. + self.counter = 0 + self.current_wave = 0 + self.last_counts = (0, 0) + + # Initialize the engine. + dp_rank = vllm_config.parallel_config.data_parallel_rank + super().__init__(vllm_config, on_head_node, handshake_address, + executor_class, log_stats, dp_rank) + + def _decorate_logs(self): # Add process-specific prefix to stdout and stderr before # we initialize the engine. 
from multiprocessing import current_process @@ -751,16 +772,6 @@ class DPEngineCoreProc(EngineCoreProc): _add_prefix(sys.stdout, process_name, pid) _add_prefix(sys.stderr, process_name, pid) - # Counts forward-passes of the model so that we can synchronize - # finished with DP peers every N steps. - self.counter = 0 - self.current_wave = 0 - - # Initialize the engine. - dp_rank = vllm_config.parallel_config.data_parallel_rank - super().__init__(vllm_config, on_head_node, handshake_address, - executor_class, log_stats, dp_rank) - def _init_data_parallel(self, vllm_config: VllmConfig): # Configure GPUs and stateless process group for data parallel. @@ -880,3 +891,70 @@ class DPEngineCoreProc(EngineCoreProc): return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) + + +class DPEngineCoreActor(DPEngineCoreProc): + """ + Ray actor for running EngineCore in a data parallel context + """ + + def __init__( + self, + vllm_config: VllmConfig, + on_head_node: bool, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + dp_rank: int = 0, + local_dp_rank: int = 0, + ): + self.addresses = addresses + vllm_config.parallel_config.data_parallel_rank = dp_rank + vllm_config.parallel_config.data_parallel_rank_local = \ + local_dp_rank + + # Ray sets CUDA_VISIBLE_DEVICES to empty string, + # we clean this up to be able to properly initialize + # data parallel groups. + del os.environ['CUDA_VISIBLE_DEVICES'] + + super().__init__(vllm_config, on_head_node, "", executor_class, + log_stats) + + def _decorate_logs(self): + pass + + @contextmanager + def _perform_handshake(self, handshake_address: str, identity: bytes, + on_head_node: bool, vllm_config: VllmConfig): + """ + For Ray, we don't need to actually perform handshake. + All addresses information is known before the actor creation. + Therefore, we simply yield these addresses. + """ + yield self.addresses + + def wait_for_init(self): + """ + Wait until the engine core is initialized. 
+ + This is just an empty method. When ray.get() on this method + (or any other method of the actor) returns, it is guaranteed + that actor creation (i.e., __init__) is complete. + """ + pass + + def run(self): + """ + Run the engine core busy loop. + """ + try: + self.run_busy_loop() + except SystemExit: + logger.debug("EngineCore exiting.") + raise + except Exception: + logger.exception("EngineCore encountered a fatal error.") + raise + finally: + self.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 232d6742b7718..fa01998aa9fe2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -29,9 +29,9 @@ from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr -from vllm.v1.utils import (CoreEngine, CoreEngineProcManager, - EngineZmqAddresses, get_engine_client_zmq_addr, - wait_for_engine_startup) +from vllm.v1.utils import (CoreEngine, CoreEngineActorManager, + CoreEngineProcManager, EngineZmqAddresses, + get_engine_client_zmq_addr, wait_for_engine_startup) logger = init_logger(__name__) @@ -68,6 +68,8 @@ class EngineCoreClient(ABC): if multiprocess_mode and asyncio_mode: if vllm_config.parallel_config.data_parallel_size > 1: + if vllm_config.parallel_config.data_parallel_backend == "ray": + return RayDPClient(vllm_config, executor_class, log_stats) return DPAsyncMPClient(vllm_config, executor_class, log_stats) return AsyncMPClient(vllm_config, executor_class, log_stats) @@ -273,7 +275,10 @@ class BackgroundResources: circular reference back to the client object.""" ctx: Union[zmq.Context] - local_engine_manager: Optional[CoreEngineProcManager] = None + # If CoreEngineProcManager, it manages local engines; + # if CoreEngineActorManager, it manages all engines. 
+ engine_manager: Optional[Union[CoreEngineProcManager, + CoreEngineActorManager]] = None coordinator: Optional[DPCoordinator] = None output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None @@ -290,8 +295,8 @@ class BackgroundResources: """Clean up background resources.""" self.engine_dead = True - if self.local_engine_manager is not None: - self.local_engine_manager.close() + if self.engine_manager is not None: + self.engine_manager.close() if self.coordinator is not None: self.coordinator.close() @@ -457,7 +462,7 @@ class MPClient(EngineCoreClient): if local_engine_count: # In server mode, start_index and local_start_index will # both be 0. - self.resources.local_engine_manager = CoreEngineProcManager( + self.resources.engine_manager = CoreEngineProcManager( EngineCoreProc.run_engine_core, vllm_config=vllm_config, executor_class=executor_class, @@ -484,13 +489,18 @@ class MPClient(EngineCoreClient): addresses.coordinator_input, addresses.coordinator_output = ( coordinator.get_engine_socket_addresses()) + proc_manager = self.resources.engine_manager + assert isinstance(proc_manager, (type(None), CoreEngineProcManager)), ( + "_wait_for_engine_startup should only be " + "called with CoreEngineProcManager") + wait_for_engine_startup( handshake_socket, addresses, self.core_engines, self.vllm_config.parallel_config, self.vllm_config.cache_config, - self.resources.local_engine_manager, + proc_manager, coordinator.proc if coordinator else None, ) @@ -887,7 +897,6 @@ class DPAsyncMPClient(AsyncMPClient): log_stats: bool, client_addresses: Optional[dict[str, str]] = None, client_index: int = 0): - self.current_wave = 0 self.engines_running = False # To route aborts to the correct engine. 
@@ -1050,3 +1059,50 @@ class DPAsyncMPClient(AsyncMPClient): if not self.resources.engine_dead: await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) + + +class RayDPClient(DPAsyncMPClient): + """ + Ray-based client for multi-proc, multi-engine (data parallel) + EngineCore. + """ + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, + ): + super().__init__(vllm_config, executor_class, log_stats, + client_addresses, client_index) + + def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool, + local_start_index: int, input_address: str, + output_address: str, + executor_class: type[Executor], log_stats: bool): + """Self-contained client mode, launch engine and coordinator process + as needed.""" + + parallel_config = vllm_config.parallel_config + assert parallel_config.data_parallel_rank == 0 + assert local_start_index == 0 + + addresses = EngineZmqAddresses( + inputs=[input_address], + outputs=[output_address], + ) + + if len(self.core_engines) > 1: + coordinator = DPCoordinator(parallel_config) + self.resources.coordinator = coordinator + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) + + # Start all engines. 
+ self.resources.engine_manager = CoreEngineActorManager( + vllm_config=vllm_config, + addresses=addresses, + executor_class=executor_class, + log_stats=log_stats) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index a26794561a526..d347efc425ef4 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -27,6 +27,8 @@ from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path, from vllm.v1.executor.abstract import Executor if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + from vllm.attention.layer import Attention from vllm.v1.engine.coordinator import DPCoordinator @@ -112,6 +114,45 @@ def get_engine_client_zmq_addr(local_only: bool, host, port or get_open_port())) +class CoreEngineState(Enum): + NEW = auto() + CONNECTED = auto() + READY = auto() + + +class CoreEngine: + """One per data parallel rank.""" + + def __init__(self, index: int = 0, local: bool = True): + self.local = local + self.index = index + self.identity = index.to_bytes(2, "little") + + self.state = CoreEngineState.NEW + + +@dataclass +class EngineZmqAddresses: + # ZMQ input socket addresses for each front-end client (requests) + inputs: list[str] + # ZMQ output socket addresses for each front-end client (responses) + outputs: list[str] + # ZMQ input socket address of DP coordinator if applicable + coordinator_input: Optional[str] = None + # ZMQ output socket address of DP coordinator if applicable + coordinator_output: Optional[str] = None + + +@dataclass +class EngineHandshakeMetadata: + """Metadata sent to each engine process during startup handshake, + including addresses of the front-end ZMQ queues that they should + connect to. + """ + addresses: EngineZmqAddresses + parallel_config: dict[str, Union[int, str]] + + class APIServerProcessManager: """Manages a group of API server processes. 
@@ -245,43 +286,168 @@ class CoreEngineProcManager: } -class CoreEngineState(Enum): - NEW = auto() - CONNECTED = auto() - READY = auto() - - -class CoreEngine: - """One per data parallel rank.""" - - def __init__(self, index: int = 0, local: bool = True): - self.local = local - self.index = index - self.identity = index.to_bytes(2, "little") - - self.state = CoreEngineState.NEW - - -@dataclass -class EngineZmqAddresses: - # ZMQ input socket addresses for each front-end client (requests) - inputs: list[str] - # ZMQ output socket addresses for each front-end client (responses) - outputs: list[str] - # ZMQ input socket address of DP coordinator if applicable - coordinator_input: Optional[str] = None - # ZMQ output socket address of DP coordinator if applicable - coordinator_output: Optional[str] = None - - -@dataclass -class EngineHandshakeMetadata: - """Metadata sent to each engine process during startup handshake, - including addresses of the front-end ZMQ queues that they should - connect to. +class CoreEngineActorManager: """ - addresses: EngineZmqAddresses - parallel_config: dict[str, Union[int, str]] + Utility class to handle creation, readiness, and shutdown + of core engine Ray actors used by the AsyncLLM and LLMEngine. + + Different from CoreEngineProcManager, this class manages + core engines for both local and remote nodes. 
+ """ + + def __init__( + self, + vllm_config: VllmConfig, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + placement_groups: Optional[list["PlacementGroup"]] = None, + local_dp_ranks: Optional[list[int]] = None, + ): + import copy + + import ray + from ray.util.scheduling_strategies import ( + PlacementGroupSchedulingStrategy) + + from vllm.v1.engine.core import DPEngineCoreActor + + self.local_engine_actors: list[ray.ActorHandle] = [] + self.remote_engine_actors: list[ray.ActorHandle] = [] + dp_size = vllm_config.parallel_config.data_parallel_size + local_engine_count = \ + vllm_config.parallel_config.data_parallel_size_local + world_size = vllm_config.parallel_config.world_size + + if ray.is_initialized(): + logger.info( + "Ray is already initialized. Skipping Ray initialization.") + else: + ray.init() + + if placement_groups is not None: + assert local_dp_ranks is not None, ( + "local_dp_ranks must be provided if " + "placement_groups is provided") + assert len(placement_groups) == len(local_dp_ranks), ( + "placement_groups and local_dp_ranks must " + "have the same length") + logger.info("Using provided placement groups") + # TODO(rui): validate passed-in placement groups + self.created_placement_groups = [] + else: + placement_groups, local_dp_ranks = \ + CoreEngineActorManager.create_dp_placement_groups(vllm_config) + self.created_placement_groups = placement_groups + assert len(placement_groups) == dp_size, ( + "Number of placement groups must match data parallel size") + + refs = [] + for index in range(dp_size): + local_index = local_dp_ranks[index] + dp_vllm_config = copy.deepcopy(vllm_config) + pg = placement_groups[index] + dp_vllm_config.parallel_config.placement_group = pg + on_head_node = index < local_engine_count + actor = ray.remote(DPEngineCoreActor).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_bundle_index=world_size, + 
)).remote(vllm_config=dp_vllm_config, + executor_class=executor_class, + log_stats=log_stats, + on_head_node=on_head_node, + addresses=addresses, + dp_rank=index, + local_dp_rank=local_index) + if on_head_node: + self.local_engine_actors.append(actor) + else: + self.remote_engine_actors.append(actor) + refs.append(actor.wait_for_init.remote()) + + ray.get(refs) + self.run_refs = [] + for actor in self.local_engine_actors + self.remote_engine_actors: + self.run_refs.append(actor.run.remote()) + + @staticmethod + def create_dp_placement_groups( + vllm_config: VllmConfig + ) -> tuple[list["PlacementGroup"], list[int]]: + + import ray + from ray._private.state import available_resources_per_node + from ray.util.state import list_nodes + + logger.info("Creating placement groups for data parallel") + dp_master_ip = \ + vllm_config.parallel_config.data_parallel_master_ip + dp_size = vllm_config.parallel_config.data_parallel_size + local_engine_count = \ + vllm_config.parallel_config.data_parallel_size_local + + nodes = list_nodes() + nodes = sorted(list_nodes(), + key=lambda node: node.node_ip != dp_master_ip) + assert nodes[0].node_ip == dp_master_ip, ( + "The first node must be the head node") + assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( + "There can only be one head node") + + available_resources = available_resources_per_node() + world_size = vllm_config.parallel_config.world_size + placement_groups: list[PlacementGroup] = [] + local_dp_ranks: list[int] = [] + + for node in nodes: + node_ip = node.node_ip + node_resources = available_resources[node.node_id] + # For now, each DP rank can only be assigned to one node + # TODO(rui): support allocating a single DP rank + # to multiple nodes + available_engine_count = node_resources["GPU"] // world_size + if node_ip == dp_master_ip: + assert available_engine_count >= local_engine_count, ( + "Not enough resources to allocate DP ranks " + f"on DP master node {node_ip}") + for i in 
range(local_engine_count): + bundles = [{ + "GPU": 1.0, + "node:" + dp_master_ip: 0.001 + }] * world_size + [{ + "CPU": 1.0 + }] + pg = ray.util.placement_group( + name=f"dp_rank_{len(placement_groups)}", + strategy="STRICT_PACK", + bundles=bundles, + ) + placement_groups.append(pg) + local_dp_ranks.append(i) + else: + for i in range(available_engine_count): + if len(placement_groups) == dp_size: + break + bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] + pg = ray.util.placement_group( + name=f"dp_rank_{len(placement_groups)}", + strategy="STRICT_PACK", + bundles=bundles, + ) + placement_groups.append(pg) + local_dp_ranks.append(i) + return placement_groups, local_dp_ranks + + def get_run_refs(self): + return self.run_refs + + def close(self): + import ray + for actor in self.local_engine_actors + self.remote_engine_actors: + ray.kill(actor) + for pg in self.created_placement_groups: + ray.util.remove_placement_group(pg) def wait_for_engine_startup( @@ -383,11 +549,19 @@ def wait_for_engine_startup( def wait_for_completion_or_failure( api_server_manager: APIServerProcessManager, - local_engine_manager: Optional[CoreEngineProcManager] = None, + engine_manager: Optional[Union[CoreEngineProcManager, + CoreEngineActorManager]] = None, coordinator: Optional["DPCoordinator"] = None) -> None: """Wait for all processes to complete or detect if any fail. Raises an exception if any process exits with a non-zero status. + + Args: + api_server_manager: The manager for API servers. + engine_manager: The manager for engine processes. + If CoreEngineProcManager, it manages local engines; + if CoreEngineActorManager, it manages all engines. + coordinator: The coordinator for data parallel. 
""" try: @@ -402,14 +576,18 @@ def wait_for_completion_or_failure( if coordinator: sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc - if local_engine_manager: - for proc in local_engine_manager.processes: + actor_run_refs = [] + if isinstance(engine_manager, CoreEngineProcManager): + for proc in engine_manager.processes: sentinel_to_proc[proc.sentinel] = proc + elif isinstance(engine_manager, CoreEngineActorManager): + actor_run_refs = engine_manager.get_run_refs() # Check if any process terminates - while sentinel_to_proc: + while sentinel_to_proc or actor_run_refs: # Wait for any process to terminate - ready_sentinels: list[Any] = connection.wait(sentinel_to_proc) + ready_sentinels: list[Any] = connection.wait(sentinel_to_proc, + timeout=5) # Process any terminated processes for sentinel in ready_sentinels: @@ -420,6 +598,11 @@ def wait_for_completion_or_failure( raise RuntimeError( f"Process {proc.name} (PID: {proc.pid}) " f"died with exit code {proc.exitcode}") + + if actor_run_refs: + import ray + _, actor_run_refs = ray.wait(actor_run_refs, timeout=5) + except KeyboardInterrupt: logger.info("Received KeyboardInterrupt, shutting down API servers...") except Exception as e: @@ -431,8 +614,8 @@ def wait_for_completion_or_failure( api_server_manager.close() if coordinator: coordinator.close() - if local_engine_manager: - local_engine_manager.close() + if engine_manager: + engine_manager.close() # Note(rob): shutdown function cannot be a bound method, From 1282bd812ea4e1511378bad5b918d609280d2b89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 3 Jun 2025 13:13:13 +0800 Subject: [PATCH 009/115] Add tarsier model support (#18985) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 20 + .../vision_language_multi_image.py | 21 + .../multimodal/processing/test_common.py | 1 + 
tests/models/registry.py | 2 +
 vllm/model_executor/models/registry.py | 1 +
 vllm/model_executor/models/tarsier.py | 643 ++++++++++++++++++
 7 files changed, 689 insertions(+)
 create mode 100644 vllm/model_executor/models/tarsier.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index b60fefdda2793..f2090fe3971e9 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -550,6 +550,7 @@ Specified using `--task generate`.
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
+| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-research/Tarsier-7b`,`omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
 
 ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
    
• For example, to use DeepSeek-VL2 series models: diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index f0504501639d2..2ef87f4f4696e 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -333,6 +333,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ) +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: model_name = "OpenGVLab/InternVL3-2B" @@ -1091,6 +1110,7 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, + "tarsier": run_tarsier, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index e776ff7fe6aec..7ce28c5a4f09f 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -691,6 +691,26 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + prompt = f"USER: {'' * len(image_urls)}\n{question}\n ASSISTANT:" + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + 
engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + model_example_map = { "aria": load_aria, "aya_vision": load_aya_vision, @@ -712,6 +732,7 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, + "tarsier": load_tarsier, } diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d7f950c23d954..2377fef820ed1 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -282,6 +282,7 @@ def _test_processing_correctness_one( "Skywork/Skywork-R1V-38B", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", + "omni-research/Tarsier-7b", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index fe49d2427c744..182a9668ebef1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -406,6 +406,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 + hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 # [Encoder-decoder] # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer # Therefore, we borrow the BartTokenizer from the original Bart model diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8efd4825beea9..fcef457a78291 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -211,6 +211,7 @@ _MULTIMODAL_MODELS = { "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", 
"Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), + "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 # [Encoder-decoder] "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py new file mode 100644 index 0000000000000..5aa3ddabc19ec --- /dev/null +++ b/vllm/model_executor/models/tarsier.py @@ -0,0 +1,643 @@ +# SPDX-License-Identifier: Apache-2.0 + +import math +from collections.abc import Iterable, Mapping, Sequence +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union, cast) + +import torch +import torch.nn as nn +from transformers import BatchFeature, CLIPVisionConfig +from transformers import LlavaConfig as HfLlavaConfig +from transformers import PretrainedConfig, SiglipVisionConfig +from transformers.image_utils import ImageInput, get_image_size, to_numpy_array +from transformers.models.llava import LlavaProcessor +from transformers.processing_utils import (ProcessingKwargs, Unpack, + _validate_images_text_input_order) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext +from vllm.jsontree import json_map_leaves +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.llava import LlavaDummyInputsBuilder +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from 
vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, ProcessingCache, + PromptReplacement, PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .clip import CLIPVisionModel +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .siglip import SiglipVisionModel +from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) +from .vision import VisionEncoderInfo, get_vision_encoder_info + + +class TarsierImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + + +class TarsierImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +TarsierImageInputs = Union[TarsierImagePixelInputs, + TarsierImageEmbeddingInputs] + + +class TarsierHfConfig(Protocol): # Based on the Tarsier's LlavaConfig + vision_config: Final[PretrainedConfig] + text_config: Final[PretrainedConfig] # Added from Tarsier's LlavaConfig + image_token_index: Final[int] + vision_feature_select_strategy: Final[str] + vision_feature_layer: Final[Union[int, list[int]]] + projector_hidden_act: Final[str] + image_newline_idx: Final[int] + image_new_idx: Final[int] + multimodal_projector_bias: bool = True + + +class TarsierProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": {}, + } + + +class TarsierProcessor(LlavaProcessor): + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], + list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[TarsierProcessorKwargs], + ) -> BatchFeature: + if images is None and 
text is None: + raise ValueError( + "You have to specify at least one of `images` or `text`.") + + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + TarsierProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor( + images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string," + " or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + pixel_values = image_inputs["pixel_values"] + height, width = get_image_size(to_numpy_array(pixel_values[0])) + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + + 1) + self.num_additional_image_tokens + 1 + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, + self.image_token * num_image_tokens) + prompt_strings.append(sample) + + return_tensors = output_kwargs["text_kwargs"].pop( + "return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, + **output_kwargs["text_kwargs"]) + return BatchFeature(data={ + **text_inputs, + **image_inputs + }, + tensor_type=return_tensors) + + +class TarsierMultiModalProjector(nn.Module): + + def __init__(self, + vision_hidden_size: int, + text_hidden_size: int, + projector_hidden_act: str, + multimodal_projector_bias: bool, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.linear_1 = ColumnParallelLinear(vision_hidden_size, + 
text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + prefix=f"{prefix}.linear_1") + self.act = get_act_fn(projector_hidden_act) + self.linear_2 = RowParallelLinear(text_hidden_size, + text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + prefix=f"{prefix}.linear_2") + + def forward(self, image_features: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) + return hidden_states + + +class TarsierProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> TarsierHfConfig: + return self.ctx.get_hf_config(HfLlavaConfig) + + def get_vision_encoder_info(self) -> VisionEncoderInfo: + return get_vision_encoder_info(self.get_hf_config()) + + def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: + hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) + # Patch for patch_size if needed (copied from vLLM LLaVA) + if hasattr(hf_processor, + 'patch_size') and hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + return hf_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def _apply_feature_select_strategy( + self, + strategy: str, + encoder_num_image_tokens: int, + ) -> int: + if strategy == "default": + return encoder_num_image_tokens - 1 + if strategy == "full": + return encoder_num_image_tokens + msg = f"Unexpected feature select strategy: {strategy!r}" + raise NotImplementedError(msg) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() + num_projected_patches = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + 
image_width=image_width, + image_height=image_height, + ), + ) + if num_projected_patches <= 0: + default_size = self.get_image_size_with_most_features() + num_projected_patches_default = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=default_size.width, + image_height=default_size.height, + ), + ) + if num_projected_patches_default <= 0: + raise ValueError( + "Could not determine a valid number of image patches.") + num_projected_patches = num_projected_patches_default + num_height_patches = int(math.sqrt(num_projected_patches)) + total_image_tokens_for_llm = num_projected_patches \ + + num_height_patches + 1 + return total_image_tokens_for_llm + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + def get_image_newline_idx(self) -> int: + return self.get_hf_config().image_newline_idx + + def get_image_new_idx(self) -> int: + return self.get_hf_config().image_new_idx + + +_I_Tarsier = TypeVar("_I_Tarsier", bound=TarsierProcessingInfo) + + +class TarsierDummyInputsBuilder(LlavaDummyInputsBuilder[_I_Tarsier]): + + pass + + +class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + 
out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index # The token ID + + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_projected_patches = images.get_feature_size(item_idx) + # This assumes num_projected_patches is a perfect square + num_height_patches = int(math.sqrt(num_projected_patches)) + num_final_image_tokens = num_projected_patches \ + + num_height_patches + 1 + else: + image_size = images.get_image_size(item_idx) + num_final_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_final_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], # Replace each single token + replacement=get_replacement, + ), + ] + + +def _build_tarsier_hf_info( + ctx: InputProcessingContext) -> TarsierProcessingInfo: + return TarsierProcessingInfo(ctx) + + +def _build_tarsier_hf_processor( + info: _I_Tarsier, + dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier], + *, + cache: Optional[ProcessingCache] = None, +) -> BaseMultiModalProcessor: + if isinstance(info, TarsierProcessingInfo): + return TarsierMultiModalProcessor( + info, + dummy_inputs, + cache=cache, + ) + raise NotImplementedError(type(info)) + + +def init_vision_tower_for_tarsier( + hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, + prefix: str = "", +) -> Union[CLIPVisionModel, SiglipVisionModel]: + vision_config = hf_config.vision_config + + feature_layers = hf_config.vision_feature_layer + base_num_hidden_layers = vision_config.num_hidden_layers + + def _get_layer_index(feature_layer_index: int, + num_hidden_layers_total: int) -> int: + if 
feature_layer_index < 0: + return num_hidden_layers_total + feature_layer_index + 1 + return feature_layer_index + + if isinstance(feature_layers, int): + num_hidden_layers_to_init = _get_layer_index(feature_layers, + base_num_hidden_layers) + elif isinstance(feature_layers, (list, tuple)): + num_hidden_layers_to_init = max( + _get_layer_index(idx, base_num_hidden_layers) + for idx in feature_layers) + else: + raise TypeError(f"vision_layer_feature type: {type(feature_layers)}" + " is not supported") + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_to_init, + require_post_norm=require_post_norm, + prefix=prefix, + ) + elif isinstance(vision_config, SiglipVisionConfig): + return SiglipVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_to_init, + require_post_norm=require_post_norm, + prefix=prefix, + ) + + msg = f"Unsupported vision config for Tarsier: {type(vision_config)}" + raise NotImplementedError(msg) + + +@MULTIMODAL_REGISTRY.register_processor(_build_tarsier_hf_processor, + info=_build_tarsier_hf_info, + dummy_inputs=TarsierDummyInputsBuilder) +class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config: TarsierHfConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config # Storing the Tarsier-specific HF config + self.vision_tower = init_vision_tower_for_tarsier( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + projector_bias = getattr(config, "multimodal_projector_bias", True) + + self.multi_modal_projector = 
TarsierMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config. + text_config, # Use text_config from Tarsier's main config + prefix=maybe_prefix(prefix, "language_model"), + ) + self.register_buffer('image_newline_idx_tensor', + torch.tensor([config.image_newline_idx], + dtype=torch.long), + persistent=False) + self.register_buffer('image_new_idx_tensor', + torch.tensor([config.image_new_idx], + dtype=torch.long), + persistent=False) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) # Assuming 3 channels + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[TarsierImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return TarsierImagePixelInputs( + type="pixel_values", + pixel_values=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return TarsierImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + raise AssertionError("This line should be unreachable.") + + def _select_image_features(self, image_features: torch.Tensor, *, + strategy: str) -> torch.Tensor: + if strategy == "default": + return image_features[:, 1:] + elif strategy == "full": + return image_features + raise ValueError(f"Unexpected select feature strategy: {strategy}") + + def _image_pixels_to_features( + self, + vision_tower: Union[CLIPVisionModel, SiglipVisionModel], + pixel_values: Union[torch.Tensor, list[torch.Tensor]], + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + # From vLLM LLaVA, vision tower output handling + image_hidden_states = vision_tower(pixel_values) + if not isinstance(image_hidden_states, torch.Tensor): + raise TypeError( + f"image_hidden_states type: {type(image_hidden_states)}" + " is not supported") + + def select_features_fn(leaf: torch.Tensor): + return self._select_image_features( + leaf, + strategy=self.config.vision_feature_select_strategy, + ) + + selected_features = cast( + Union[torch.Tensor, tuple[torch.Tensor, ...]], + json_map_leaves(select_features_fn, image_hidden_states), + ) + return selected_features + + def _add_tarsier_split_tokens( + self, projected_image_features: torch.Tensor) -> torch.Tensor: + """ + Implements Tarsier's `add_split_tokens` logic. 
+ """ + num_images, num_projected_patches, embed_dim = \ + projected_image_features.shape + num_height_patches = int(math.sqrt(num_projected_patches)) + num_width_patches = num_projected_patches // num_height_patches + device = projected_image_features.device + embedding_layer = self.language_model.model.embed_tokens + image_newline_emb = embedding_layer( + self.image_newline_idx_tensor.to(device)).squeeze(0) + image_new_emb = embedding_layer( + self.image_new_idx_tensor.to(device)).squeeze(0) + try: + current_image_features_grid = projected_image_features.view( + num_images, num_height_patches, num_width_patches, embed_dim) + except RuntimeError as e: + raise RuntimeError( + "Cannot reshape projected_image_features" + f" with shape {projected_image_features.shape} " + f"to ({num_images}, {num_height_patches}," + f" {num_width_patches}, {embed_dim}). " + "Ensure num_projected_patches is compatible" + " with a grid structure. " + f"num_projected_patches={num_projected_patches}, " + f"derived num_height_patches={num_height_patches}. 
") from e + + image_newline_expanded = image_newline_emb.expand( + (num_images, num_height_patches, 1, embed_dim)) + features_with_newlines = torch.cat( + [current_image_features_grid, image_newline_expanded], + dim=2 # Concatenate along width dim + ) + new_num_patches_after_newline = num_projected_patches \ + + num_height_patches + features_with_newlines_flat = features_with_newlines.view( + num_images, new_num_patches_after_newline, embed_dim) + image_new_expanded = image_new_emb.expand((num_images, 1, embed_dim)) + final_image_features = torch.cat( + [features_with_newlines_flat, image_new_expanded], + dim=1 # Concatenate along patch sequence dim + ) + return final_image_features + + def _process_image_pixels( + self, + inputs: TarsierImagePixelInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + assert self.vision_tower is not None + pixel_values = inputs["pixel_values"] + image_features_selected = self._image_pixels_to_features( + self.vision_tower, pixel_values) # type: ignore + if isinstance(image_features_selected, torch.Tensor): + projected_features = self.multi_modal_projector( + image_features_selected) + final_features = self._add_tarsier_split_tokens(projected_features) + return final_features + else: + raise TypeError( + f"_image_pixels_to_features type:" + f" {type(image_features_selected)} is not supported") + + def _process_image_input( + self, + image_input: TarsierImageInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + if image_input["type"] == "image_embeds": + projected_features = image_input["data"] + if isinstance(projected_features, torch.Tensor): + return self._add_tarsier_split_tokens(projected_features) + else: + raise ValueError("Incorrect type of image_embeds. " + f"Got type: {type(projected_features)}. 
") + assert self.vision_tower is not None + return self._process_image_pixels(image_input) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + return self._process_image_input(image_input) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_index, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) From 17430e36531aeade52518b13961706d4227310f9 Mon Sep 17 00:00:00 2001 From: Reid 
<61492567+reidliu41@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:35:12 +0800 Subject: [PATCH 010/115] [bugfix] small fix logic issue (#18999) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a5b155024b73a..e3b8a18ccdfef 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -455,7 +455,7 @@ class EngineArgs: title="ModelConfig", description=ModelConfig.__doc__, ) - if 'serve' not in sys.argv[1:] and '--help' not in sys.argv[1:]: + if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]): model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--task", **model_kwargs["task"]) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) From cc977286e7a4350183aeef873858fe0dc6774740 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 02:00:45 -0400 Subject: [PATCH 011/115] Reduce logs in CLI scripts and plugin loader (#18970) Signed-off-by: mgoin --- vllm/benchmarks/datasets.py | 6 +++--- vllm/benchmarks/latency.py | 2 -- vllm/benchmarks/throughput.py | 1 - vllm/compilation/backends.py | 6 +++--- vllm/plugins/__init__.py | 19 +++++++++++++------ 5 files changed, 19 insertions(+), 15 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 35cc303f60eeb..21fe3eb629e21 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -328,9 +328,9 @@ class RandomDataset(BenchmarkDataset): output_high = int(output_len * (1 + range_ratio)) # Add logging for debugging - logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, - output_high) + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, input_high, output_low, output_high) input_lens = np.random.randint(input_low, input_high 
+ 1, diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index c9e03cc3bf781..dc1c42879b2cf 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -78,7 +78,6 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - print(args) if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: raise OSError( "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " @@ -101,7 +100,6 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, detokenize=not args.disable_detokenize, ) - print(sampling_params) dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 13110a8b4db3f..3ea6c194baa8a 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -527,7 +527,6 @@ def main(args: argparse.Namespace): validate_args(args) if args.seed is None: args.seed = 0 - print(args) random.seed(args.seed) # Sample the requests. 
tokenizer = AutoTokenizer.from_pretrained( diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index b724479a95dee..c4bfffe929970 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,13 +31,13 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( "2.8.0"): - logger.info("Using InductorStandaloneAdaptor") + logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: - logger.info("Using InductorAdaptor") + logger.debug("Using InductorAdaptor") return InductorAdaptor() else: - logger.info("Using EagerAdaptor") + logger.debug("Using EagerAdaptor") return EagerAdaptor() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 2884cb46fecd7..4cd3552f8a552 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -10,6 +10,8 @@ import vllm.envs as envs logger = logging.getLogger(__name__) +DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins' + # make sure one process only loads plugins once plugins_loaded = False @@ -28,19 +30,24 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: logger.debug("No plugins for group %s found.", group) return {} - logger.info("Available plugins for group %s:", group) + # Check if the only discovered plugin is the default one + is_default_group = (group == DEFAULT_PLUGINS_GROUP) + # Use INFO for non-default groups and DEBUG for the default group + log_level = logger.debug if is_default_group else logger.info + + log_level("Available plugins for group %s:", group) for plugin in discovered_plugins: - logger.info("- %s -> %s", plugin.name, plugin.value) + log_level("- %s -> %s", plugin.name, plugin.value) if allowed_plugins is None: - logger.info("All plugins in this group will be loaded. 
" - "Set `VLLM_PLUGINS` to control which plugins to load.") + log_level("All plugins in this group will be loaded. " + "Set `VLLM_PLUGINS` to control which plugins to load.") plugins = dict[str, Callable[[], Any]]() for plugin in discovered_plugins: if allowed_plugins is None or plugin.name in allowed_plugins: if allowed_plugins is not None: - logger.info("Loading plugin %s", plugin.name) + log_level("Loading plugin %s", plugin.name) try: func = plugin.load() @@ -80,7 +87,7 @@ def load_general_plugins(): # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - plugins = load_plugins_by_group(group='vllm.general_plugins') + plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions for func in plugins.values(): func() From d32aa2e67002afe936b8d2cadffd8adc7aaf48e7 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:16:17 +0800 Subject: [PATCH 012/115] [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019) Signed-off-by: Lu Fang --- docker/Dockerfile.neuron | 2 +- docs/getting_started/installation/cpu/build.inc.md | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/rocm-build.txt | 2 +- requirements/tpu.txt | 2 +- requirements/xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron index 259dc5a23f78b..8bc23554718dc 100644 --- a/docker/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE 
neuron diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 7d6472afa7ea7..7ddadccb1b4f1 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -17,7 +17,7 @@ Third, install Python packages for vLLM CPU backend building: ```console pip install --upgrade pip -pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy +pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` diff --git a/pyproject.toml b/pyproject.toml index 10f5dbeae6851..307878f7e38d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements/build.txt requires = [ - "cmake>=3.26", + "cmake>=3.26.1", "ninja", "packaging>=24.2", "setuptools>=77.0.3,<80.0.0", diff --git a/requirements/build.txt b/requirements/build.txt index 320e5b8925843..528cd3b538efd 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26 +cmake>=3.26.1 ninja packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 981b90632c182..94201543cd4f3 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -7,7 +7,7 @@ torchvision==0.22.0 torchaudio==2.7.0 triton==3.2 -cmake>=3.26,<4 +cmake>=3.26.1,<4 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 diff --git a/requirements/tpu.txt b/requirements/tpu.txt index edc8b2a456670..47e638463bf58 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -2,7 +2,7 @@ -r common.txt # Dependencies for TPU -cmake>=3.26 +cmake>=3.26.1 packaging>=24.2 setuptools-scm>=8 wheel diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 04c4d4ff85a0d..3cb6a4a8addac 100644 --- a/requirements/xpu.txt +++ 
b/requirements/xpu.txt @@ -2,7 +2,7 @@ -r common.txt ray>=2.9 -cmake>=3.26 +cmake>=3.26.1 packaging>=24.2 setuptools-scm>=8 setuptools>=77.0.3,<80.0.0 From f32fcd944430603ebcbbf04454b4e15754168ef4 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 3 Jun 2025 16:01:48 +0800 Subject: [PATCH 013/115] [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015) Signed-off-by: Chen Zhang --- docs/design/v1/prefix_caching.md | 2 +- tests/v1/core/test_kv_cache_utils.py | 12 +++++------- tests/v1/core/test_prefix_caching.py | 4 ++-- tests/v1/core/test_specialized_manager.py | 4 ++-- vllm/v1/core/block_pool.py | 8 ++++---- vllm/v1/core/kv_cache_manager.py | 4 ++-- vllm/v1/core/kv_cache_utils.py | 14 +++++++------- vllm/v1/core/single_type_kv_cache_manager.py | 10 +++++----- 8 files changed, 28 insertions(+), 30 deletions(-) diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md index ad041b0059f58..bbdfb255214dd 100644 --- a/docs/design/v1/prefix_caching.md +++ b/docs/design/v1/prefix_caching.md @@ -104,7 +104,7 @@ class KVCacheBlock: block_id: int # The block hash (will be assigned when the block is full, # and will be reset when the block is evicted). - block_hash: BlockHashType + block_hash: BlockHash # The number of requests using this block now. 
ref_cnt: int diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index d3d62cf09232d..61aee87529884 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -100,8 +100,8 @@ def test_kv_cache_block(): assert block.ref_cnt == 0 # Test block hash setting and resetting - block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123, - token_ids=(1, 2, 3)) + block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123, + token_ids=(1, 2, 3)) block.block_hash = block_hash assert block.block_hash == block_hash @@ -282,7 +282,7 @@ def test_hash_block_tokens(hash_fn): block_hash = hash_block_tokens(hash_fn, parent_block_hash, curr_block_token_ids, extra_keys) - assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash) assert block_hash.hash_value == hash_fn( (parent_block_hash, curr_block_token_ids, extra_keys)) assert block_hash.token_ids == curr_block_token_ids @@ -306,10 +306,8 @@ def test_hash_request_tokens(hash_fn): block_hashes = hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 - assert isinstance(block_hashes[0], - vllm.v1.core.kv_cache_utils.BlockHashType) - assert isinstance(block_hashes[1], - vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash) + assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash) # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index ba3c0b3cf3169..1a7a31d98506c 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -12,7 +12,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import sha256 from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import 
KVCacheManager, Request -from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, hash_block_tokens) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) @@ -547,7 +547,7 @@ def test_cache_blocks(hash_fn): # Test that blocks are cached correctly for 2 full blocks from the start. blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHashType] = [] + block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index 101a2379be377..4217dc37e2df9 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -3,7 +3,7 @@ import torch from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock +from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager from vllm.v1.kv_cache_interface import SlidingWindowSpec @@ -32,7 +32,7 @@ def test_sliding_window_possible_cached_prefix(): def run_one_case(block_is_cached, expect_length): block_hash_list = [ - BlockHashType(i, ()) for i in range(len(block_is_cached)) + BlockHash(i, ()) for i in range(len(block_is_cached)) ] block_pool.cached_block_hash_to_block.clear() diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index f2ed183b68fc8..a0a065df9b1ca 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -6,7 +6,7 @@ from typing import Callable, Optional from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved, BlockStored, KVCacheEvent) from vllm.logger import init_logger -from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, +from vllm.v1.core.kv_cache_utils import (BlockHash, FreeKVCacheBlockQueue, 
KVCacheBlock, generate_block_hash_extra_keys, hash_block_tokens) @@ -55,7 +55,7 @@ class BlockPool: # if there is already an identical block in the cache. This is because # we want to make sure the allocated block IDs won't change so that # block tables are append-only. - self.cached_block_hash_to_block: dict[BlockHashType, dict[ + self.cached_block_hash_to_block: dict[BlockHash, dict[ int, KVCacheBlock]] = defaultdict(dict) # To represent a placeholder block with block_id=0. @@ -67,7 +67,7 @@ class BlockPool: self.kv_event_queue: list[KVCacheEvent] = [] def get_cached_block(self, - block_hash: BlockHashType) -> Optional[KVCacheBlock]: + block_hash: BlockHash) -> Optional[KVCacheBlock]: """Get a cached block by the block hash, or None if cache miss. If there are duplicated blocks, we return the first block in the cache. @@ -87,7 +87,7 @@ class BlockPool: self, request: Request, blocks: list[KVCacheBlock], - block_hashes: list[BlockHashType], + block_hashes: list[BlockHash], num_cached_blocks: int, num_full_blocks: int, block_size: int, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 0f6098d2b4005..59e07382b652f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -8,7 +8,7 @@ from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger from vllm.utils import sha256 from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, hash_request_tokens) from vllm.v1.core.single_type_kv_cache_manager import ( get_manager_for_kv_cache_spec) @@ -92,7 +92,7 @@ class KVCacheManager: # This is to avoid recomputing the block hashes for each call of # `get_computed_blocks` or `allocate_slots`. 
self.req_to_block_hashes: defaultdict[ - str, list[BlockHashType]] = defaultdict(list) + str, list[BlockHash]] = defaultdict(list) @property def usage(self) -> float: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index a41fe48818702..3ccad97e9919b 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -18,7 +18,7 @@ from vllm.v1.request import Request logger = init_logger(__name__) -class BlockHashType(NamedTuple): +class BlockHash(NamedTuple): """Hash value of a block (int), the token IDs in the block, and extra keys. We keep a tuple of token IDs and extra keys to reduce the likelihood of hash collisions when the hash value is the same. By using SHA256 however, @@ -117,7 +117,7 @@ class KVCacheBlock: ref_cnt: int = 0 # The hash of the block composed of (block hash, tuple of token IDs). # It is only available when the block is full. - _block_hash: Optional[BlockHashType] = None + _block_hash: Optional[BlockHash] = None # Used to construct a doubly linked list for free blocks. # These two attributes should only be manipulated by FreeKVCacheBlockQueue. @@ -131,11 +131,11 @@ class KVCacheBlock: self.ref_cnt -= 1 @property - def block_hash(self) -> Optional[BlockHashType]: + def block_hash(self) -> Optional[BlockHash]: return self._block_hash @block_hash.setter - def block_hash(self, block_hash: BlockHashType): + def block_hash(self, block_hash: BlockHash): assert self.block_hash is None, ( "The block already has a hash. This should not happen.") self._block_hash = block_hash @@ -398,7 +398,7 @@ def hash_block_tokens( hash_function: Callable, parent_block_hash: Optional[int], curr_block_token_ids: Sequence[int], - extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType: + extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. 
We use LRU cache for this function to avoid recomputing @@ -419,14 +419,14 @@ def hash_block_tokens( parent_block_hash = NONE_HASH curr_block_token_ids_tuple = tuple(curr_block_token_ids) - return BlockHashType( + return BlockHash( hash_function( (parent_block_hash, curr_block_token_ids_tuple, extra_keys)), curr_block_token_ids_tuple, extra_keys) def hash_request_tokens(hash_function: Any, block_size: int, - request: Request) -> list[BlockHashType]: + request: Request) -> list[BlockHash]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 0223c9ceec8de..e69e9ac9f6a37 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -5,7 +5,7 @@ from typing import Callable from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock +from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec, SlidingWindowSpec) from vllm.v1.request import Request @@ -133,7 +133,7 @@ class SingleTypeKVCacheManager(ABC): req_blocks.extend(new_blocks) return new_blocks - def cache_blocks(self, request: Request, block_hashes: list[BlockHashType], + def cache_blocks(self, request: Request, block_hashes: list[BlockHash], num_tokens: int) -> None: """ Cache the blocks for the request. 
@@ -187,7 +187,7 @@ class SingleTypeKVCacheManager(ABC): raise NotImplementedError @abstractmethod - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -228,7 +228,7 @@ class SingleTypeKVCacheManager(ABC): class FullAttentionManager(SingleTypeKVCacheManager): - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: computed_blocks: list[KVCacheBlock] = [] max_num_blocks = max_length // self.block_size @@ -280,7 +280,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager): self.sliding_window_contiguous_blocks += 1 self._null_block = block_pool.null_block - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to # optimize the time complexity from O(max_num_blocks) to From 6d18ed2a2e858a8061dfe8c2e140c2c498d6a99a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 04:21:53 -0400 Subject: [PATCH 014/115] Update docker docs with ARM CUDA cross-compile (#19037) Signed-off-by: mgoin --- docs/deployment/docker.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 516640f6fd3c4..9e506d3d7ba38 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \ -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" ``` +!!! 
note + If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution. + + Run the following command on your host machine to register QEMU user static handlers: + + ```console + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + ``` + + After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command. + ## Use the custom-built vLLM Docker image To run vLLM with the custom-built Docker image: From 42243fbda04d908aa16f17bf3d5f9cf35e4ef26f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Jun 2025 17:08:03 +0800 Subject: [PATCH 015/115] [Doc] Add InternVL LoRA support (#19055) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f2090fe3971e9..71414d2aad821 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -524,7 +524,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | | `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | From ec2dcd80bc173c06a4c48377d4a6b6ca2c78a2f5 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 3 Jun 2025 17:08:20 +0800 Subject: [PATCH 016/115] [Misc] Update `WeightsMapper` for qwen2-vl/qwen2.5-vl (#19054) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_5_vl.py | 13 +++++++++---- vllm/model_executor/models/qwen2_vl.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e3fa9f67ca078..f62c7e1d2ee16 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -823,10 +823,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): # To ensure correct weight loading and mapping. 
- hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.language_model.": "language_model.model.", + "model.visual.": "visual.", + # mapping for original checkpoint + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 873baa56faf37..5c30e36c7ce3a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1071,10 +1071,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): # To ensure correct weight loading and mapping. - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.language_model.": "language_model.model.", + "model.visual.": "visual.", + # mapping for original checkpoint + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 118ff921118cc81061a2af865a1e13840ceb6792 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 3 Jun 2025 17:29:41 +0800 Subject: [PATCH 017/115] [Doc] Update V1 user guide for embedding and enc-dec models (#19060) Signed-off-by: DarkLight1337 --- docs/usage/v1_guide.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 3d5d7ce45cce4..a2321bf98900b 100644 --- a/docs/usage/v1_guide.md +++ 
b/docs/usage/v1_guide.md @@ -1,5 +1,7 @@ # vLLM V1 +**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.** + V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason! @@ -51,9 +53,9 @@ This living user guide outlines a few known **important changes and limitations* | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| | **Structured Output Alternative Backends** | 🟡 Planned | -| **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | +| **Embedding Models** | 🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015)) | | **Mamba Models** | 🟡 Planned | -| **Encoder-Decoder Models** | 🟡 Planned | +| **Encoder-Decoder Models** | 🟠 Delayed | | **Request-level Structured Output Backend** | 🔴 Deprecated | | **best_of** | 🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))| | **Per-Request Logits Processors** | 🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) | @@ -63,10 +65,11 @@ This living user guide outlines a few known **important changes and limitations* - **🟢 Functional**: Fully operational, with ongoing optimizations. - **🚧 WIP**: Under active development. - **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs). -- **🔴 Deprecated**: Not planned for v1 unless there is strong demand. +- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later. 
+- **🔴 Deprecated**: Not planned for V1 unless there is strong demand. **Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same -way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically +way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically allocate a fixed token budget per request, enabling features like chunked prefills, prefix caching, and speculative decoding without a strict separation between prefill and decode phases. @@ -140,7 +143,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco and the majority fall into the following categories. V1 support for these models will be added eventually. **Embedding Models** -Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage. +Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work. + +Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this. 
**Mamba Models** Models using selective state-space mechanisms (instead of standard transformer attention) From 4e88723f32f1115130566b31dba0d3c31ab1b13f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 3 Jun 2025 21:42:17 +0800 Subject: [PATCH 018/115] [doc] clarify windows support (#19088) Signed-off-by: youkaichao --- docs/getting_started/installation/gpu.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index 3c983f600673d..f8a3acef784fc 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -19,6 +19,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G - OS: Linux - Python: 3.9 -- 3.12 +!!! note + vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows). 
+ === "NVIDIA CUDA" --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements" From 4e68ae5e59b24fad3865eb34421b36bef4751888 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Jun 2025 22:30:18 +0800 Subject: [PATCH 019/115] [CI/Build] Remove V0 LoRA test (#19066) Signed-off-by: Jee Jee Li --- tests/lora/test_add_lora.py | 21 ++----------------- tests/lora/test_chatglm3_tp.py | 10 --------- tests/lora/test_llama_tp.py | 8 -------- tests/lora/test_lora_functions.py | 34 ++++++++----------------------- tests/lora/test_mixtral.py | 8 -------- tests/lora/test_quant_model.py | 8 -------- tests/lora/test_qwen2vl.py | 8 -------- tests/lora/test_worker.py | 10 --------- 8 files changed, 10 insertions(+), 97 deletions(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index c8b7a5cbf7470..17347300b40c8 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -6,6 +6,8 @@ import pytest import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -16,14 +18,6 @@ LORA_RANK = 64 DEFAULT_MAX_LORAS = 4 * 3 -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def get_lora_requests(lora_path) -> list[LoRARequest]: lora_requests: list[LoRARequest] = [ LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path) @@ -88,17 +82,6 @@ async def test_add_lora(chatglm3_lora_files): trust_remote_code=True, enforce_eager=True) - # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` - # environment variable. 
reload vllm.enging.async_llm_engine as - # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the - # env var. - import importlib - - import vllm.engine.async_llm_engine - importlib.reload(vllm.engine.async_llm_engine) - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - # split lora_requests into 3 parts part_size = len(lora_requests) // 3 dummy_run_requests = lora_requests[:part_size] diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 2c18a115be487..cd9526c8b1012 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest - import vllm from vllm.lora.request import LoRARequest @@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 580992dea53da..54daea5b9dbf0 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -33,14 +33,6 @@ EXPECTED_LORA_OUTPUT = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 7ae33a848a0aa..fd80f61a59773 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -2,26 +2,24 @@ """ Script to test add_lora, remove_lora, pin_lora, list_loras functions. 
""" - -import os - import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.llm_engine import LLMEngine +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.lora.request import LoRARequest MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass +# @pytest.fixture(autouse=True) +# def v1(run_with_both_engines_lora): +# # Simple autouse wrapper to run both engines for each test +# # This can be promoted up to conftest.py to run for every +# # test in a package +# pass def make_lora_request(lora_id: int): @@ -79,22 +77,6 @@ def test_lora_functions_sync(): @pytest.mark.asyncio async def test_lora_functions_async(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip( - reason= - "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") - - # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` - # environment variable. reload vllm.enging.async_llm_engine as - # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the - # env var. 
- import importlib - - import vllm.engine.async_llm_engine - importlib.reload(vllm.engine.async_llm_engine) - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - max_loras = 4 engine_args = AsyncEngineArgs(model=MODEL_PATH, enable_lora=True, diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index aea7691935dfe..4e77c5559e164 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -10,14 +10,6 @@ from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]) -> list[str]: diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 7a76ffb740ef2..43e2975cd87c0 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -37,14 +37,6 @@ else: ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 162714df2f130..20a1ae67db2dc 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -13,14 +13,6 @@ from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams -@pytest.fixture(autouse=not current_platform.is_cpu()) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @dataclass class TestConfig: model_path: str diff --git 
a/tests/lora/test_worker.py b/tests/lora/test_worker.py index e5ae660af1400..1a5d527164d0b 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -6,8 +6,6 @@ import tempfile from typing import Union from unittest.mock import patch -import pytest - import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, @@ -18,14 +16,6 @@ from vllm.v1.worker.gpu_worker import Worker as V1Worker from vllm.worker.worker import Worker -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): From 476844d44cbf315c6c1e8431946bdecfe9823834 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:39:24 +0100 Subject: [PATCH 020/115] Fix underscores in dict keys passed via CLI (#19030) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/test_utils.py | 11 +++++++++++ vllm/utils.py | 13 ++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index dd8777f068887..42e0df1ffb017 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -259,11 +259,18 @@ def test_dict_args(parser): "--model-name=something.something", "--hf-overrides.key1", "val1", + # Test nesting "--hf-overrides.key2.key3", "val2", "--hf-overrides.key2.key4", "val3", + # Test = sign "--hf-overrides.key5=val4", + # Test underscore to dash conversion + "--hf_overrides.key_6", + "val5", + "--hf_overrides.key-7.key_8", + "val6", ] parsed_args = parser.parse_args(args) assert parsed_args.model_name == "something.something" @@ -274,6 +281,10 @@ def test_dict_args(parser): "key4": "val3", }, "key5": "val4", + "key_6": "val5", + 
"key-7": { + "key_8": "val6", + }, } diff --git a/vllm/utils.py b/vllm/utils.py index c879b38d065aa..b4152e6b24700 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1456,17 +1456,24 @@ class FlexibleArgumentParser(ArgumentParser): if '--config' in args: args = self._pull_args_from_config(args) + def repl(match: re.Match) -> str: + """Replaces underscores with dashes in the matched string.""" + return match.group(0).replace("_", "-") + + # Everything between the first -- and the first . + pattern = re.compile(r"(?<=--)[^\.]*") + # Convert underscores to dashes and vice versa in argument names processed_args = [] for arg in args: if arg.startswith('--'): if '=' in arg: key, value = arg.split('=', 1) - key = '--' + key[len('--'):].replace('_', '-') + key = pattern.sub(repl, key, count=1) processed_args.append(f'{key}={value}') else: - processed_args.append('--' + - arg[len('--'):].replace('_', '-')) + key = pattern.sub(repl, arg, count=1) + processed_args.append(key) elif arg.startswith('-O') and arg != '-O' and len(arg) == 2: # allow -O flag to be used without space, e.g. 
-O3 processed_args.append('-O') From d81edded69a5534a80785b68cde26c547cfcd4c6 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 3 Jun 2025 17:06:04 +0200 Subject: [PATCH 021/115] [Bugfix] disable processor cache (#19068) Signed-off-by: raushan --- vllm/v1/engine/mm_input_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index fcb90bebdb627..45fb5cd23f60f 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -34,8 +34,8 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = mm_config is not None and \ - not mm_config.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache = ( + mm_config is not None and mm_config.disable_mm_preprocessor_cache) self.use_cache = not disable_mm_preprocessor_cache self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs) From d00dd65cd4dbc1ebbdbe2cd070ff694e9e9321a2 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Tue, 3 Jun 2025 23:44:34 +0800 Subject: [PATCH 022/115] [Doc] Improve the Pull Request template with key components (#19086) Signed-off-by: Lu Fang --- .github/PULL_REQUEST_TEMPLATE.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 65be771b94fb9..c1d1e07bf628f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,15 @@ -FILL IN THE PR DESCRIPTION HERE +## Essential Elements of an Effective PR Description Checklist +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. 
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results -FIX #xxxx (*link existing issues this PR will resolve*) +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +## Purpose + +## Test Plan + +## Test Result **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) From 4b7817c119e27ad9b1e1930a34006eff9680a457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 3 Jun 2025 18:15:16 +0200 Subject: [PATCH 023/115] [Misc] Add missing `_Backend` enums (#19081) Signed-off-by: nicklucche --- vllm/platforms/interface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 5c4f7a2f7dc76..c7a6272623576 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -47,6 +47,8 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 + TRITON_MLA_VLLM_V1 = enum.auto() + FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 HPU_ATTN = enum.auto() PALLAS = enum.auto() From d054da1992175787f936d18aead51bef663a0399 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Wed, 4 Jun 2025 02:02:07 +0800 Subject: [PATCH 024/115] [Misc] fix: add miss best_of param validation (#18555) Signed-off-by: googs1025 --- vllm/sampling_params.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dc38daa388ced..4294465f68fcf 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -389,6 +389,17 @@ class SamplingParams( f"type {type(self.n)}") if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") + if self.best_of is not None: + if not isinstance(self.best_of, int): + raise ValueError( + f"best_of must be an integer, got {type(self.best_of)}") + 
if self.best_of < 1: + raise ValueError( + f"best_of must be at least 1, got {self.best_of}") + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") if not -2.0 <= self.presence_penalty <= 2.0: raise ValueError("presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}.") From 02f0c7b220422792f5e53de2a7d51d2d3ff2df28 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 3 Jun 2025 11:20:17 -0700 Subject: [PATCH 025/115] [Misc] Add SPDX-FileCopyrightText (#19100) Signed-off-by: simon-mo --- .buildkite/check-wheel-size.py | 1 + .buildkite/generate_index.py | 1 + .buildkite/lm-eval-harness/conftest.py | 1 + .buildkite/lm-eval-harness/test_lm_eval_correctness.py | 1 + .../scripts/convert-results-json-to-markdown.py | 1 + .buildkite/nightly-benchmarks/scripts/download-tokenizer.py | 1 + .../nightly-benchmarks/scripts/generate-nightly-markdown.py | 1 + .../nightly-benchmarks/scripts/get-lmdeploy-modelname.py | 1 + .../nightly-benchmarks/scripts/summary-nightly-results.py | 1 + benchmarks/backend_request_func.py | 1 + benchmarks/benchmark_dataset.py | 1 + benchmarks/benchmark_latency.py | 1 + benchmarks/benchmark_long_document_qa_throughput.py | 1 + benchmarks/benchmark_prefix_caching.py | 1 + benchmarks/benchmark_prioritization.py | 1 + benchmarks/benchmark_serving.py | 1 + benchmarks/benchmark_serving_structured_output.py | 1 + benchmarks/benchmark_throughput.py | 1 + benchmarks/benchmark_utils.py | 1 + benchmarks/cutlass_benchmarks/sparse_benchmarks.py | 1 + benchmarks/cutlass_benchmarks/utils.py | 1 + benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 1 + benchmarks/cutlass_benchmarks/weight_shapes.py | 1 + benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py | 1 + benchmarks/disagg_benchmarks/round_robin_proxy.py | 1 + benchmarks/disagg_benchmarks/visualize_benchmark_results.py | 1 + benchmarks/fused_kernels/layernorm_rms_benchmarks.py | 1 + 
benchmarks/kernels/bench_fp8_gemm.py | 1 + benchmarks/kernels/benchmark_aqlm.py | 1 + benchmarks/kernels/benchmark_bitblas.py | 1 + benchmarks/kernels/benchmark_cutlass_fp4_moe.py | 1 + benchmarks/kernels/benchmark_grouped_gemm_cutlass.py | 1 + benchmarks/kernels/benchmark_layernorm.py | 1 + benchmarks/kernels/benchmark_lora.py | 1 + benchmarks/kernels/benchmark_machete.py | 1 + benchmarks/kernels/benchmark_marlin.py | 1 + benchmarks/kernels/benchmark_moe.py | 1 + benchmarks/kernels/benchmark_moe_permute_unpermute.py | 1 + benchmarks/kernels/benchmark_paged_attention.py | 1 + benchmarks/kernels/benchmark_quant.py | 1 + benchmarks/kernels/benchmark_rmsnorm.py | 1 + benchmarks/kernels/benchmark_rope.py | 1 + benchmarks/kernels/benchmark_shapes.py | 1 + benchmarks/kernels/benchmark_w8a8_block_fp8.py | 1 + .../kernels/deepgemm/benchmark_fp8_block_dense_gemm.py | 1 + benchmarks/kernels/graph_machete_bench.py | 1 + benchmarks/kernels/utils.py | 1 + benchmarks/kernels/weight_shapes.py | 1 + benchmarks/overheads/benchmark_hashing.py | 1 + cmake/hipify.py | 1 + csrc/cutlass_extensions/vllm_cutlass_library_extension.py | 1 + csrc/moe/marlin_moe_wna16/generate_kernels.py | 1 + csrc/quantization/gptq_marlin/generate_kernels.py | 1 + csrc/quantization/machete/generate.py | 1 + docs/mkdocs/hooks/generate_examples.py | 1 + docs/mkdocs/hooks/remove_announcement.py | 1 + docs/mkdocs/hooks/url_schemes.py | 1 + examples/offline_inference/audio_language.py | 1 + examples/offline_inference/automatic_prefix_caching.py | 1 + examples/offline_inference/basic/basic.py | 1 + examples/offline_inference/basic/chat.py | 1 + examples/offline_inference/basic/classify.py | 1 + examples/offline_inference/basic/embed.py | 1 + examples/offline_inference/basic/generate.py | 1 + examples/offline_inference/basic/score.py | 1 + examples/offline_inference/batch_llm_inference.py | 1 + examples/offline_inference/chat_with_tools.py | 1 + examples/offline_inference/context_extension.py | 1 + 
examples/offline_inference/data_parallel.py | 1 + .../disaggregated-prefill-v1/decode_example.py | 1 + .../disaggregated-prefill-v1/prefill_example.py | 1 + examples/offline_inference/disaggregated_prefill.py | 1 + examples/offline_inference/eagle.py | 1 + examples/offline_inference/embed_jina_embeddings_v3.py | 1 + examples/offline_inference/embed_matryoshka_fy.py | 1 + examples/offline_inference/encoder_decoder.py | 1 + examples/offline_inference/encoder_decoder_multimodal.py | 1 + examples/offline_inference/llm_engine_example.py | 1 + examples/offline_inference/load_sharded_state.py | 1 + .../offline_inference/lora_with_quantization_inference.py | 1 + examples/offline_inference/metrics.py | 1 + examples/offline_inference/mistral-small.py | 1 + examples/offline_inference/mlpspeculator.py | 1 + examples/offline_inference/multilora_inference.py | 1 + examples/offline_inference/neuron.py | 1 + examples/offline_inference/neuron_eagle.py | 1 + examples/offline_inference/neuron_int8_quantization.py | 1 + examples/offline_inference/neuron_multimodal.py | 1 + examples/offline_inference/neuron_speculation.py | 1 + examples/offline_inference/prefix_caching.py | 1 + examples/offline_inference/prithvi_geospatial_mae.py | 1 + examples/offline_inference/profiling.py | 1 + examples/offline_inference/profiling_tpu/profiling.py | 1 + examples/offline_inference/prompt_embed_inference.py | 1 + examples/offline_inference/qwen2_5_omni/only_thinker.py | 1 + examples/offline_inference/qwen_1m.py | 1 + examples/offline_inference/reproducibility.py | 1 + examples/offline_inference/rlhf.py | 1 + examples/offline_inference/rlhf_colocate.py | 1 + examples/offline_inference/rlhf_utils.py | 1 + examples/offline_inference/save_sharded_state.py | 1 + examples/offline_inference/simple_profiling.py | 1 + examples/offline_inference/structured_outputs.py | 1 + examples/offline_inference/torchrun_example.py | 1 + examples/offline_inference/tpu.py | 1 + examples/offline_inference/vision_language.py | 
1 + examples/offline_inference/vision_language_embedding.py | 1 + examples/offline_inference/vision_language_multi_image.py | 1 + examples/online_serving/api_client.py | 1 + examples/online_serving/cohere_rerank_client.py | 1 + .../disaggregated_serving/disagg_proxy_demo.py | 1 + examples/online_serving/gradio_openai_chatbot_webserver.py | 1 + examples/online_serving/gradio_webserver.py | 1 + examples/online_serving/jinaai_rerank_client.py | 1 + examples/online_serving/kv_events_subscriber.py | 1 + examples/online_serving/openai_chat_completion_client.py | 1 + .../openai_chat_completion_client_for_multimodal.py | 1 + .../openai_chat_completion_client_with_tools.py | 1 + .../openai_chat_completion_client_with_tools_required.py | 1 + .../openai_chat_completion_structured_outputs.py | 1 + ...nai_chat_completion_structured_outputs_structural_tag.py | 1 + ...nai_chat_completion_structured_outputs_with_reasoning.py | 1 + .../openai_chat_completion_tool_calls_with_reasoning.py | 1 + .../online_serving/openai_chat_completion_with_reasoning.py | 1 + .../openai_chat_completion_with_reasoning_streaming.py | 1 + .../openai_chat_embedding_client_for_multimodal.py | 1 + examples/online_serving/openai_classification_client.py | 1 + examples/online_serving/openai_completion_client.py | 1 + examples/online_serving/openai_cross_encoder_score.py | 1 + examples/online_serving/openai_embedding_client.py | 1 + examples/online_serving/openai_embedding_matryoshka_fy.py | 1 + examples/online_serving/openai_pooling_client.py | 1 + examples/online_serving/openai_transcription_client.py | 1 + examples/online_serving/opentelemetry/dummy_client.py | 1 + .../prompt_embed_inference_with_openai_client.py | 1 + examples/online_serving/ray_serve_deepseek.py | 1 + .../retrieval_augmented_generation_with_langchain.py | 1 + .../retrieval_augmented_generation_with_llamaindex.py | 1 + .../online_serving/streamlit_openai_chatbot_webserver.py | 1 + examples/online_serving/utils.py | 1 + 
examples/others/lmcache/cpu_offload_lmcache.py | 1 + examples/others/lmcache/disagg_prefill_lmcache_v0.py | 1 + .../disagg_prefill_lmcache_v1/disagg_proxy_server.py | 1 + examples/others/lmcache/kv_cache_sharing_lmcache_v1.py | 1 + examples/others/tensorize_vllm_model.py | 1 + find_cuda_init.py | 1 + setup.py | 1 + tests/async_engine/api_server_async_engine.py | 1 + tests/async_engine/conftest.py | 1 + tests/async_engine/test_api_server.py | 1 + tests/async_engine/test_async_llm_engine.py | 1 + tests/async_engine/test_request_tracker.py | 1 + tests/basic_correctness/test_basic_correctness.py | 1 + tests/basic_correctness/test_chunked_prefill.py | 1 + tests/basic_correctness/test_cpu_offload.py | 1 + tests/basic_correctness/test_cumem.py | 1 + tests/basic_correctness/test_preemption.py | 1 + tests/benchmarks/test_latency_cli.py | 1 + tests/benchmarks/test_serve_cli.py | 1 + tests/benchmarks/test_throughput_cli.py | 1 + tests/build_cython.py | 1 + tests/compile/backend.py | 1 + tests/compile/conftest.py | 1 + tests/compile/piecewise/test_full_cudagraph.py | 1 + tests/compile/piecewise/test_simple.py | 1 + tests/compile/piecewise/test_toy_llama.py | 1 + tests/compile/test_async_tp.py | 1 + tests/compile/test_basic_correctness.py | 1 + tests/compile/test_full_graph.py | 1 + tests/compile/test_functionalization.py | 1 + tests/compile/test_fusion.py | 1 + tests/compile/test_pass_manager.py | 1 + tests/compile/test_sequence_parallelism.py | 1 + tests/compile/test_silu_mul_quant_fusion.py | 1 + tests/compile/test_wrapper.py | 1 + tests/conftest.py | 1 + tests/core/block/conftest.py | 1 + tests/core/block/e2e/conftest.py | 1 + tests/core/block/e2e/test_correctness.py | 1 + tests/core/block/e2e/test_correctness_sliding_window.py | 1 + tests/core/block/test_block_manager.py | 1 + tests/core/block/test_block_table.py | 1 + tests/core/block/test_common.py | 1 + tests/core/block/test_cpu_gpu_block_allocator.py | 1 + tests/core/block/test_naive_block.py | 1 + 
tests/core/block/test_prefix_caching_block.py | 1 + tests/core/conftest.py | 1 + tests/core/test_chunked_prefill_scheduler.py | 1 + tests/core/test_num_computed_tokens_update.py | 1 + tests/core/test_scheduler.py | 1 + tests/core/test_scheduler_encoder_decoder.py | 1 + tests/core/test_serialization.py | 1 + tests/core/utils.py | 1 + tests/detokenizer/conftest.py | 1 + tests/detokenizer/test_disable_detokenization.py | 1 + tests/detokenizer/test_stop_checker.py | 1 + tests/detokenizer/test_stop_reason.py | 1 + tests/detokenizer/test_stop_strings.py | 1 + tests/distributed/conftest.py | 1 + tests/distributed/test_ca_buffer_sharing.py | 1 + tests/distributed/test_comm_ops.py | 1 + tests/distributed/test_custom_all_reduce.py | 1 + tests/distributed/test_distributed_oot.py | 1 + tests/distributed/test_events.py | 1 + tests/distributed/test_expert_parallel.py | 1 + tests/distributed/test_multi_node_assignment.py | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/distributed/test_pipeline_partition.py | 1 + tests/distributed/test_pp_cudagraph.py | 1 + tests/distributed/test_pynccl.py | 1 + tests/distributed/test_same_node.py | 1 + tests/distributed/test_sequence_parallel.py | 1 + tests/distributed/test_shm_broadcast.py | 1 + tests/distributed/test_torchrun_example.py | 1 + tests/distributed/test_utils.py | 1 + tests/encoder_decoder/test_e2e_correctness.py | 1 + tests/engine/conftest.py | 1 + tests/engine/test_arg_utils.py | 1 + tests/engine/test_computed_prefix_blocks.py | 1 + tests/engine/test_executor.py | 1 + tests/engine/test_multi_step_output_processor.py | 1 + tests/engine/test_multiproc_workers.py | 1 + tests/engine/test_options.py | 1 + tests/engine/test_short_mm_context.py | 1 + tests/entrypoints/conftest.py | 1 + tests/entrypoints/llm/test_accuracy.py | 1 + tests/entrypoints/llm/test_chat.py | 1 + tests/entrypoints/llm/test_collective_rpc.py | 1 + tests/entrypoints/llm/test_encode.py | 1 + tests/entrypoints/llm/test_generate.py | 1 + 
tests/entrypoints/llm/test_generate_multiple_loras.py | 1 + tests/entrypoints/llm/test_gpu_utilization.py | 1 + tests/entrypoints/llm/test_guided_generate.py | 1 + tests/entrypoints/llm/test_lazy_outlines.py | 1 + tests/entrypoints/llm/test_prompt_validation.py | 1 + tests/entrypoints/offline_mode/test_offline_mode.py | 1 + tests/entrypoints/openai/correctness/test_lmeval.py | 1 + tests/entrypoints/openai/correctness/test_mteb.py | 1 + .../correctness/test_transcription_api_correctness.py | 1 + tests/entrypoints/openai/test_async_tokenization.py | 1 + tests/entrypoints/openai/test_audio.py | 1 + tests/entrypoints/openai/test_basic.py | 1 + tests/entrypoints/openai/test_chat.py | 1 + tests/entrypoints/openai/test_chat_echo.py | 1 + tests/entrypoints/openai/test_chat_logit_bias_validation.py | 1 + tests/entrypoints/openai/test_chat_template.py | 1 + tests/entrypoints/openai/test_chat_with_tool_reasoning.py | 1 + tests/entrypoints/openai/test_chunked_prompt.py | 1 + tests/entrypoints/openai/test_classification.py | 1 + tests/entrypoints/openai/test_cli_args.py | 1 + tests/entrypoints/openai/test_completion.py | 1 + .../openai/test_completion_with_function_calling.py | 1 + .../openai/test_completion_with_prompt_embeds.py | 1 + tests/entrypoints/openai/test_embedding.py | 1 + tests/entrypoints/openai/test_embedding_dimensions.py | 1 + tests/entrypoints/openai/test_encoder_decoder.py | 1 + tests/entrypoints/openai/test_lora_adapters.py | 1 + tests/entrypoints/openai/test_lora_resolvers.py | 1 + tests/entrypoints/openai/test_metrics.py | 1 + tests/entrypoints/openai/test_models.py | 1 + tests/entrypoints/openai/test_oot_registration.py | 1 + tests/entrypoints/openai/test_openai_schema.py | 1 + tests/entrypoints/openai/test_pooling.py | 1 + tests/entrypoints/openai/test_prompt_validation.py | 1 + tests/entrypoints/openai/test_rerank.py | 1 + tests/entrypoints/openai/test_return_tokens_as_ids.py | 1 + tests/entrypoints/openai/test_root_path.py | 1 + 
tests/entrypoints/openai/test_run_batch.py | 1 + tests/entrypoints/openai/test_score.py | 1 + tests/entrypoints/openai/test_serving_chat.py | 1 + tests/entrypoints/openai/test_serving_models.py | 1 + tests/entrypoints/openai/test_shutdown.py | 1 + tests/entrypoints/openai/test_sleep.py | 1 + tests/entrypoints/openai/test_tensorizer_entrypoint.py | 1 + tests/entrypoints/openai/test_tokenization.py | 1 + tests/entrypoints/openai/test_transcription_validation.py | 1 + tests/entrypoints/openai/test_truncation.py | 1 + tests/entrypoints/openai/test_video.py | 1 + tests/entrypoints/openai/test_vision.py | 1 + tests/entrypoints/openai/test_vision_embedding.py | 1 + .../openai/tool_parsers/test_llama4_pythonic_tool_parser.py | 1 + .../openai/tool_parsers/test_pythonic_tool_parser.py | 1 + tests/entrypoints/openai/tool_parsers/utils.py | 1 + tests/entrypoints/test_api_server_process_manager.py | 1 + tests/entrypoints/test_chat_utils.py | 1 + tests/entrypoints/test_ssl_cert_refresher.py | 1 + tests/fastsafetensors_loader/test_fastsafetensors_loader.py | 1 + tests/fastsafetensors_loader/test_weight_utils.py | 1 + tests/kernels/allclose_default.py | 1 + tests/kernels/attention/conftest.py | 1 + tests/kernels/attention/test_attention.py | 1 + tests/kernels/attention/test_attention_selector.py | 1 + tests/kernels/attention/test_blocksparse_attention.py | 1 + tests/kernels/attention/test_cache.py | 1 + tests/kernels/attention/test_cascade_flash_attn.py | 1 + tests/kernels/attention/test_encoder_decoder_attn.py | 1 + tests/kernels/attention/test_flash_attn.py | 1 + tests/kernels/attention/test_flashinfer.py | 1 + tests/kernels/attention/test_flashmla.py | 1 + tests/kernels/attention/test_lightning_attn.py | 1 + tests/kernels/attention/test_merge_attn_states.py | 1 + tests/kernels/attention/test_mha_attn.py | 1 + tests/kernels/attention/test_mla_decode_cpu.py | 1 + tests/kernels/attention/test_prefix_prefill.py | 1 + tests/kernels/attention/test_rocm_attention_selector.py | 1 + 
tests/kernels/attention/test_triton_decode_attention.py | 1 + tests/kernels/attention/test_triton_unified_attention.py | 1 + tests/kernels/core/test_activation.py | 1 + tests/kernels/core/test_fused_quant_layernorm.py | 1 + tests/kernels/core/test_layernorm.py | 1 + tests/kernels/core/test_opcheck.py | 1 + tests/kernels/core/test_permute_cols.py | 1 + tests/kernels/core/test_pos_encoding.py | 1 + tests/kernels/core/test_rotary_embedding.py | 1 + tests/kernels/core/test_uva.py | 1 + tests/kernels/mamba/test_causal_conv1d.py | 1 + tests/kernels/mamba/test_mamba_mixer2.py | 1 + tests/kernels/mamba/test_mamba_ssm.py | 1 + tests/kernels/mamba/test_mamba_ssm_ssd.py | 1 + tests/kernels/moe/test_batched_moe.py | 1 + tests/kernels/moe/test_cutlass_moe.py | 1 + tests/kernels/moe/test_moe.py | 1 + tests/kernels/moe/test_moe_permute_unpermute.py | 1 + tests/kernels/moe/test_nvfp4_moe.py | 1 + tests/kernels/moe/test_pplx_moe.py | 1 + tests/kernels/moe/test_rocm_aiter_topk.py | 1 + tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 1 + tests/kernels/quant_utils.py | 1 + tests/kernels/quantization/nvfp4_utils.py | 1 + tests/kernels/quantization/test_allspark_gemm.py | 1 + tests/kernels/quantization/test_aqlm.py | 1 + tests/kernels/quantization/test_awq.py | 1 + tests/kernels/quantization/test_awq_triton.py | 1 + tests/kernels/quantization/test_block_fp8.py | 1 + tests/kernels/quantization/test_block_int8.py | 1 + tests/kernels/quantization/test_cutlass_2of4_sparse.py | 1 + tests/kernels/quantization/test_cutlass_scaled_mm.py | 1 + tests/kernels/quantization/test_fp8_quant.py | 1 + tests/kernels/quantization/test_ggml.py | 1 + tests/kernels/quantization/test_gguf.py | 1 + tests/kernels/quantization/test_gptq.py | 1 + tests/kernels/quantization/test_int8_kernel.py | 1 + tests/kernels/quantization/test_int8_quant.py | 1 + tests/kernels/quantization/test_machete_mm.py | 1 + tests/kernels/quantization/test_marlin_gemm.py | 1 + tests/kernels/quantization/test_nvfp4_quant.py | 1 + 
tests/kernels/quantization/test_nvfp4_scaled_mm.py | 1 + tests/kernels/quantization/test_rocm_skinny_gemms.py | 1 + tests/kernels/quantization/test_triton_scaled_mm.py | 1 + tests/kernels/test_cutlass_mla_decode.py | 1 + tests/kernels/test_fused_quant_activation.py | 1 + tests/kernels/test_triton_flash_attention.py | 1 + tests/kernels/utils.py | 1 + tests/kv_transfer/test_disagg.py | 1 + tests/kv_transfer/test_lookup_buffer.py | 1 + tests/kv_transfer/test_module.py | 1 + tests/kv_transfer/test_send_recv.py | 1 + tests/lora/conftest.py | 1 + tests/lora/test_add_lora.py | 1 + tests/lora/test_baichuan.py | 1 + tests/lora/test_chatglm3_tp.py | 1 + tests/lora/test_layers.py | 1 + tests/lora/test_llama_tp.py | 1 + tests/lora/test_lora_allowed_token_ids.py | 1 + tests/lora/test_lora_checkpoints.py | 1 + tests/lora/test_lora_functions.py | 1 + tests/lora/test_lora_huggingface.py | 1 + tests/lora/test_lora_manager.py | 1 + tests/lora/test_minicpmv_tp.py | 1 + tests/lora/test_mixtral.py | 1 + tests/lora/test_peft_helper.py | 1 + tests/lora/test_phi.py | 1 + tests/lora/test_punica_ops.py | 1 + tests/lora/test_quant_model.py | 1 + tests/lora/test_qwen2vl.py | 1 + tests/lora/test_resolver.py | 1 + tests/lora/test_tokenizer_group.py | 1 + tests/lora/test_transfomers_model.py | 1 + tests/lora/test_utils.py | 1 + tests/lora/test_worker.py | 1 + tests/lora/utils.py | 1 + tests/metrics/test_metrics.py | 1 + tests/mistral_tool_use/conftest.py | 1 + tests/mistral_tool_use/test_mistral_tool_calls.py | 1 + tests/mistral_tool_use/utils.py | 1 + tests/model_executor/conftest.py | 1 + tests/model_executor/test_enabled_custom_ops.py | 1 + tests/model_executor/test_guided_processors.py | 1 + tests/model_executor/test_logits_processor.py | 1 + tests/model_executor/test_model_load_with_params.py | 1 + tests/model_executor/test_weight_utils.py | 1 + tests/models/language/generation/test_bart.py | 1 + tests/models/language/generation/test_common.py | 1 + 
tests/models/language/generation/test_granite.py | 1 + tests/models/language/generation/test_granitemoehybrid.py | 1 + tests/models/language/generation/test_hybrid.py | 1 + tests/models/language/generation/test_mistral.py | 1 + tests/models/language/generation/test_phimoe.py | 1 + tests/models/language/pooling/embed_utils.py | 1 + tests/models/language/pooling/mteb_utils.py | 1 + tests/models/language/pooling/test_baai.py | 1 + tests/models/language/pooling/test_classification.py | 1 + tests/models/language/pooling/test_embedding.py | 1 + tests/models/language/pooling/test_gritlm.py | 1 + tests/models/language/pooling/test_gte.py | 1 + tests/models/language/pooling/test_jina.py | 1 + tests/models/language/pooling/test_nomic.py | 1 + tests/models/language/pooling/test_nomic_max_model_len.py | 1 + tests/models/language/pooling/test_scoring.py | 1 + .../models/language/pooling/test_snowflake_arctic_embed.py | 1 + tests/models/language/pooling/test_truncation_control.py | 1 + tests/models/multimodal/generation/test_common.py | 1 + tests/models/multimodal/generation/test_florence2.py | 1 + tests/models/multimodal/generation/test_granite_speech.py | 1 + tests/models/multimodal/generation/test_interleaved.py | 1 + tests/models/multimodal/generation/test_mllama.py | 1 + tests/models/multimodal/generation/test_phi4mm.py | 1 + tests/models/multimodal/generation/test_pixtral.py | 1 + tests/models/multimodal/generation/test_qwen2_vl.py | 1 + tests/models/multimodal/generation/test_ultravox.py | 1 + tests/models/multimodal/generation/test_whisper.py | 1 + tests/models/multimodal/generation/vlm_utils/builders.py | 1 + .../multimodal/generation/vlm_utils/case_filtering.py | 1 + tests/models/multimodal/generation/vlm_utils/core.py | 1 + .../models/multimodal/generation/vlm_utils/custom_inputs.py | 1 + tests/models/multimodal/generation/vlm_utils/model_utils.py | 1 + tests/models/multimodal/generation/vlm_utils/runners.py | 1 + tests/models/multimodal/generation/vlm_utils/types.py 
| 1 + tests/models/multimodal/pooling/test_dse_qwen2_vl.py | 1 + tests/models/multimodal/pooling/test_intern_vit.py | 1 + tests/models/multimodal/pooling/test_llava_next.py | 1 + tests/models/multimodal/pooling/test_phi3v.py | 1 + tests/models/multimodal/processing/test_common.py | 1 + tests/models/multimodal/processing/test_h2ovl.py | 1 + tests/models/multimodal/processing/test_idefics3.py | 1 + tests/models/multimodal/processing/test_internvl.py | 1 + tests/models/multimodal/processing/test_llama4.py | 1 + tests/models/multimodal/processing/test_llava_next.py | 1 + tests/models/multimodal/processing/test_llava_onevision.py | 1 + tests/models/multimodal/processing/test_minimax_vl_01.py | 1 + tests/models/multimodal/processing/test_mllama.py | 1 + tests/models/multimodal/processing/test_phi3v.py | 1 + tests/models/multimodal/processing/test_phi4mm.py | 1 + tests/models/multimodal/processing/test_qwen2_vl.py | 1 + tests/models/multimodal/processing/test_smolvlm.py | 1 + tests/models/quantization/test_aqlm.py | 1 + tests/models/quantization/test_awq.py | 1 + tests/models/quantization/test_bitblas.py | 1 + tests/models/quantization/test_fp8.py | 1 + tests/models/quantization/test_gguf.py | 1 + tests/models/quantization/test_gptq_bitblas.py | 1 + tests/models/quantization/test_gptq_marlin.py | 1 + tests/models/quantization/test_gptq_marlin_24.py | 1 + tests/models/quantization/test_modelopt.py | 1 + tests/models/quantization/test_mxfp4.py | 1 + tests/models/quantization/test_nvfp4.py | 1 + tests/models/registry.py | 1 + tests/models/test_initialization.py | 1 + tests/models/test_oot_registration.py | 1 + tests/models/test_registry.py | 1 + tests/models/test_transformers.py | 1 + tests/models/test_utils.py | 1 + tests/models/test_vision.py | 1 + tests/models/utils.py | 1 + tests/mq_llm_engine/conftest.py | 1 + tests/mq_llm_engine/test_abort.py | 1 + tests/mq_llm_engine/test_error_handling.py | 1 + tests/mq_llm_engine/test_load.py | 1 + tests/mq_llm_engine/utils.py | 1 + 
tests/multi_step/test_correctness_async_llm.py | 1 + tests/multi_step/test_correctness_llm.py | 1 + tests/multimodal/test_hasher.py | 1 + tests/multimodal/test_image.py | 1 + tests/multimodal/test_inputs.py | 1 + tests/multimodal/test_processing.py | 1 + tests/multimodal/test_utils.py | 1 + tests/multimodal/test_video.py | 1 + tests/multimodal/utils.py | 1 + tests/neuron/1_core/test_activation.py | 1 + tests/neuron/1_core/test_block_table.py | 1 + tests/neuron/1_core/test_cache.py | 1 + tests/neuron/1_core/test_layernorm.py | 1 + tests/neuron/1_core/test_logits_processor.py | 1 + tests/neuron/1_core/test_neuron_model_runner.py | 1 + tests/neuron/1_core/test_neuron_quant.py | 1 + tests/neuron/1_core/test_prefix_prefill.py | 1 + tests/neuron/1_core/test_rotary_embedding.py | 1 + tests/neuron/2_core/test_comm_ops.py | 1 + tests/neuron/2_core/test_eagle.py | 1 + tests/neuron/2_core/test_mistral.py | 1 + tests/neuron/2_core/test_multi_lora.py | 1 + tests/plugins/lora_resolvers/test_filesystem_resolver.py | 1 + tests/plugins/vllm_add_dummy_model/setup.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/__init__.py | 1 + .../vllm_add_dummy_model/my_gemma_embedding.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py | 1 + tests/plugins/vllm_add_dummy_platform/setup.py | 1 + .../vllm_add_dummy_platform/__init__.py | 1 + .../vllm_add_dummy_platform/dummy_attention_backend.py | 1 + .../vllm_add_dummy_platform/dummy_platform.py | 1 + tests/plugins_tests/conftest.py | 1 + tests/plugins_tests/test_platform_plugins.py | 1 + tests/plugins_tests/test_scheduler_plugins.py | 1 + tests/prefix_caching/test_disable_sliding_window.py | 1 + tests/prefix_caching/test_prefix_caching.py | 1 + tests/prompt_adapter/test_bloom.py | 1 + tests/prompt_adapter/test_multi_adapter_inference.py | 1 + tests/prompt_adapter/test_pa_lora.py | 1 + tests/quantization/test_auto_round.py | 1 + 
tests/quantization/test_bitsandbytes.py | 1 + tests/quantization/test_compressed_tensors.py | 1 + tests/quantization/test_configs.py | 1 + tests/quantization/test_cpu_offload.py | 3 ++- tests/quantization/test_experts_int8.py | 1 + tests/quantization/test_fp8.py | 1 + tests/quantization/test_gptq_dynamic.py | 1 + tests/quantization/test_ipex_quant.py | 1 + tests/quantization/test_lm_head.py | 1 + tests/quantization/test_ptpc_fp8.py | 1 + tests/quantization/test_quark.py | 1 + tests/quantization/test_register_quantization_config.py | 1 + tests/quantization/test_torchao.py | 1 + tests/quantization/utils.py | 1 + tests/reasoning/test_deepseekr1_reasoning_parser.py | 1 + tests/reasoning/test_granite_reasoning_parser.py | 1 + tests/reasoning/test_qwen3_reasoning_parser.py | 1 + tests/reasoning/utils.py | 1 + .../test_runai_model_streamer_loader.py | 1 + tests/runai_model_streamer_test/test_weight_utils.py | 1 + tests/samplers/test_beam_search.py | 1 + tests/samplers/test_ignore_eos.py | 1 + tests/samplers/test_logits_processor.py | 1 + tests/samplers/test_logprobs.py | 1 + tests/samplers/test_no_bad_words.py | 1 + tests/samplers/test_ranks.py | 1 + tests/samplers/test_rejection_sampler.py | 1 + tests/samplers/test_sampler.py | 1 + tests/samplers/test_seeded_generate.py | 1 + tests/samplers/test_typical_acceptance_sampler.py | 1 + tests/spec_decode/conftest.py | 1 + tests/spec_decode/e2e/conftest.py | 1 + tests/spec_decode/e2e/test_compatibility.py | 1 + tests/spec_decode/e2e/test_eagle_correctness.py | 1 + tests/spec_decode/e2e/test_integration.py | 1 + tests/spec_decode/e2e/test_integration_dist_tp2.py | 1 + tests/spec_decode/e2e/test_integration_dist_tp4.py | 1 + tests/spec_decode/e2e/test_logprobs.py | 1 + tests/spec_decode/e2e/test_medusa_correctness.py | 1 + tests/spec_decode/e2e/test_mlp_correctness.py | 1 + tests/spec_decode/e2e/test_mtp_correctness.py | 1 + tests/spec_decode/e2e/test_multistep_correctness.py | 1 + tests/spec_decode/e2e/test_ngram_correctness.py 
| 1 + tests/spec_decode/e2e/test_seed.py | 1 + tests/spec_decode/test_batch_expansion.py | 1 + tests/spec_decode/test_dynamic_spec_decode.py | 1 + tests/spec_decode/test_memory_usage.py | 1 + tests/spec_decode/test_metrics.py | 1 + tests/spec_decode/test_multi_step_worker.py | 1 + tests/spec_decode/test_ngram_worker.py | 1 + tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/test_utils.py | 1 + tests/spec_decode/utils.py | 1 + tests/standalone_tests/lazy_imports.py | 1 + tests/tensorizer_loader/conftest.py | 1 + tests/tensorizer_loader/test_tensorizer.py | 1 + tests/test_cache_block_hashing.py | 1 + tests/test_config.py | 1 + tests/test_embedded_commit.py | 1 + tests/test_inputs.py | 1 + tests/test_logger.py | 1 + tests/test_outputs.py | 1 + tests/test_regression.py | 1 + tests/test_sampling_params.py | 1 + tests/test_scalartype.py | 1 + tests/test_seed_behavior.py | 3 ++- tests/test_sequence.py | 1 + tests/test_sharded_state_loader.py | 1 + tests/test_triton_utils.py | 1 + tests/test_utils.py | 1 + tests/test_version.py | 1 + tests/test_vllm_port.py | 1 + tests/tokenization/test_cached_tokenizer.py | 1 + tests/tokenization/test_detokenize.py | 1 + tests/tokenization/test_get_eos.py | 1 + tests/tokenization/test_mistral_tokenizer.py | 1 + tests/tokenization/test_tokenizer.py | 1 + tests/tokenization/test_tokenizer_group.py | 1 + tests/tokenization/test_tokenizer_registry.py | 1 + tests/tool_use/conftest.py | 1 + tests/tool_use/test_chat_completion_request_validations.py | 1 + tests/tool_use/test_chat_completions.py | 1 + tests/tool_use/test_jamba_tool_parser.py | 1 + tests/tool_use/test_parallel_tool_calls.py | 1 + tests/tool_use/test_tool_calls.py | 1 + tests/tool_use/test_tool_choice_required.py | 1 + tests/tool_use/utils.py | 1 + tests/tpu/lora/test_lora.py | 1 + tests/tpu/test_compilation.py | 1 + tests/tpu/test_custom_dispatcher.py | 1 + tests/tpu/test_moe_pallas.py | 1 + 
tests/tpu/test_quantization_accuracy.py | 1 + tests/tracing/test_tracing.py | 1 + tests/utils.py | 1 + tests/v1/core/test_kv_cache_utils.py | 1 + tests/v1/core/test_prefix_caching.py | 1 + tests/v1/core/test_scheduler.py | 1 + tests/v1/core/test_scheduler_e2e.py | 1 + tests/v1/core/test_specialized_manager.py | 1 + tests/v1/e2e/test_cascade_attention.py | 1 + tests/v1/e2e/test_correctness_sliding_window.py | 1 + tests/v1/e2e/test_spec_decode.py | 1 + tests/v1/engine/conftest.py | 1 + tests/v1/engine/test_async_llm.py | 1 + tests/v1/engine/test_engine_args.py | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + tests/v1/engine/test_llm_engine.py | 1 + tests/v1/engine/test_output_processor.py | 1 + tests/v1/engine/utils.py | 1 + tests/v1/entrypoints/conftest.py | 1 + tests/v1/entrypoints/llm/test_struct_output_generate.py | 1 + tests/v1/entrypoints/openai/test_chat_completion.py | 1 + tests/v1/entrypoints/openai/test_completion.py | 1 + tests/v1/entrypoints/openai/test_multi_api_servers.py | 1 + tests/v1/kv_connector/nixl_integration/test_accuracy.py | 1 + tests/v1/kv_connector/nixl_integration/test_edge_cases.py | 1 + tests/v1/kv_connector/nixl_integration/toy_proxy_server.py | 1 + tests/v1/kv_connector/unit/test_multi_connector.py | 1 + tests/v1/kv_connector/unit/test_nixl_connector.py | 1 + tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py | 1 + tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py | 1 + tests/v1/kv_connector/unit/utils.py | 1 + tests/v1/metrics/test_ray_metrics.py | 1 + tests/v1/sample/test_logprobs.py | 1 + tests/v1/sample/test_logprobs_e2e.py | 1 + tests/v1/sample/test_rejection_sampler.py | 1 + tests/v1/sample/test_sampler.py | 1 + tests/v1/sample/test_sampling_params_e2e.py | 1 + tests/v1/sample/test_topk_topp_sampler.py | 1 + tests/v1/sample/utils.py | 1 + tests/v1/shutdown/test_delete.py | 1 + tests/v1/shutdown/test_forward_error.py | 1 + tests/v1/shutdown/test_processor_error.py | 
1 + tests/v1/shutdown/test_startup_error.py | 1 + tests/v1/shutdown/utils.py | 1 + tests/v1/spec_decode/test_eagle.py | 1 + tests/v1/spec_decode/test_max_len.py | 1 + tests/v1/spec_decode/test_ngram.py | 1 + tests/v1/structured_output/test_utils.py | 1 + tests/v1/test_async_llm_dp.py | 1 + tests/v1/test_metrics_reader.py | 1 + tests/v1/test_oracle.py | 1 + tests/v1/test_serial_utils.py | 1 + tests/v1/test_utils.py | 1 + tests/v1/tpu/test_basic.py | 1 + tests/v1/tpu/test_mha_attn.py | 1 + tests/v1/tpu/test_multimodal.py | 1 + tests/v1/tpu/test_pallas.py | 1 + tests/v1/tpu/test_perf.py | 1 + tests/v1/tpu/test_sampler.py | 1 + tests/v1/tpu/test_topk_topp_sampler.py | 1 + tests/v1/tpu/worker/test_tpu_model_runner.py | 1 + tests/v1/worker/test_gpu_input_batch.py | 1 + tests/v1/worker/test_gpu_model_runner.py | 1 + tests/vllm_test_utils/setup.py | 1 + tests/vllm_test_utils/vllm_test_utils/__init__.py | 1 + tests/vllm_test_utils/vllm_test_utils/blame.py | 1 + tests/vllm_test_utils/vllm_test_utils/monitor.py | 1 + tests/weight_loading/test_weight_loading.py | 1 + tests/worker/conftest.py | 1 + tests/worker/test_encoder_decoder_model_runner.py | 1 + tests/worker/test_model_input.py | 1 + tests/worker/test_model_runner.py | 1 + tests/worker/test_profile.py | 1 + tests/worker/test_swap.py | 1 + tools/check_spdx_header.py | 5 ++++- tools/check_triton_import.py | 1 + tools/enforce_regex_import.py | 1 + tools/profiler/print_layerwise_table.py | 1 + tools/profiler/visualize_layerwise_profile.py | 1 + tools/report_build_time_ninja.py | 1 + use_existing_torch.py | 1 + vllm/__init__.py | 1 + vllm/_custom_ops.py | 1 + vllm/_ipex_ops.py | 1 + vllm/adapter_commons/layers.py | 1 + vllm/adapter_commons/models.py | 1 + vllm/adapter_commons/request.py | 1 + vllm/adapter_commons/utils.py | 1 + vllm/adapter_commons/worker_manager.py | 1 + vllm/assets/audio.py | 1 + vllm/assets/base.py | 1 + vllm/assets/image.py | 1 + vllm/assets/video.py | 1 + vllm/attention/__init__.py | 1 + 
vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 1 + vllm/attention/backends/cpu_mla.py | 1 + vllm/attention/backends/dual_chunk_flash_attn.py | 1 + vllm/attention/backends/flash_attn.py | 1 + vllm/attention/backends/flashinfer.py | 1 + vllm/attention/backends/flashmla.py | 1 + vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/backends/ipex_attn.py | 1 + vllm/attention/backends/mla/common.py | 1 + vllm/attention/backends/pallas.py | 1 + vllm/attention/backends/placeholder_attn.py | 1 + vllm/attention/backends/rocm_aiter_mla.py | 1 + vllm/attention/backends/rocm_flash_attn.py | 1 + vllm/attention/backends/torch_sdpa.py | 1 + vllm/attention/backends/triton_mla.py | 1 + vllm/attention/backends/utils.py | 1 + vllm/attention/backends/xformers.py | 1 + vllm/attention/layer.py | 1 + .../blocksparse_attention/blocksparse_attention_kernel.py | 1 + vllm/attention/ops/blocksparse_attention/interface.py | 1 + vllm/attention/ops/blocksparse_attention/utils.py | 1 + vllm/attention/ops/chunked_prefill_paged_decode.py | 1 + vllm/attention/ops/flashmla.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/attention/ops/ipex_attn.py | 1 + vllm/attention/ops/merge_attn_states.py | 1 + vllm/attention/ops/nki_flash_attn.py | 1 + vllm/attention/ops/paged_attn.py | 1 + vllm/attention/ops/prefix_prefill.py | 1 + vllm/attention/ops/rocm_aiter_mla.py | 1 + vllm/attention/ops/rocm_aiter_paged_attn.py | 1 + vllm/attention/ops/triton_decode_attention.py | 1 + vllm/attention/ops/triton_flash_attention.py | 1 + vllm/attention/ops/triton_merge_attn_states.py | 1 + vllm/attention/ops/triton_unified_attention.py | 1 + vllm/attention/selector.py | 1 + vllm/attention/utils/fa_utils.py | 1 + vllm/beam_search.py | 1 + vllm/benchmarks/datasets.py | 1 + vllm/benchmarks/endpoint_request_func.py | 1 + vllm/benchmarks/latency.py | 1 + vllm/benchmarks/serve.py | 1 + vllm/benchmarks/throughput.py | 1 + vllm/benchmarks/utils.py | 1 + vllm/collect_env.py | 6 
++++-- vllm/compilation/activation_quant_fusion.py | 1 + vllm/compilation/backends.py | 1 + vllm/compilation/base_piecewise_backend.py | 1 + vllm/compilation/collective_fusion.py | 1 + vllm/compilation/compiler_interface.py | 1 + vllm/compilation/counter.py | 1 + vllm/compilation/cuda_piecewise_backend.py | 1 + vllm/compilation/decorators.py | 1 + vllm/compilation/fix_functionalization.py | 1 + vllm/compilation/fusion.py | 1 + vllm/compilation/fx_utils.py | 1 + vllm/compilation/inductor_pass.py | 1 + vllm/compilation/monitor.py | 1 + vllm/compilation/multi_output_match.py | 1 + vllm/compilation/noop_elimination.py | 1 + vllm/compilation/pass_manager.py | 1 + vllm/compilation/sequence_parallelism.py | 1 + vllm/compilation/torch25_custom_graph_pass.py | 1 + vllm/compilation/vllm_inductor_pass.py | 1 + vllm/compilation/wrapper.py | 1 + vllm/config.py | 1 + vllm/connections.py | 1 + vllm/core/block/block_table.py | 1 + vllm/core/block/common.py | 1 + vllm/core/block/cpu_gpu_block_allocator.py | 1 + vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block/utils.py | 1 + vllm/core/block_manager.py | 1 + vllm/core/evictor.py | 1 + vllm/core/interfaces.py | 1 + vllm/core/placeholder_block_space_manager.py | 1 + vllm/core/scheduler.py | 1 + vllm/device_allocator/cumem.py | 1 + vllm/distributed/__init__.py | 1 + vllm/distributed/communication_op.py | 1 + vllm/distributed/device_communicators/all2all.py | 1 + .../device_communicators/base_device_communicator.py | 1 + vllm/distributed/device_communicators/cpu_communicator.py | 1 + vllm/distributed/device_communicators/cuda_communicator.py | 1 + vllm/distributed/device_communicators/cuda_wrapper.py | 1 + vllm/distributed/device_communicators/custom_all_reduce.py | 1 + .../device_communicators/custom_all_reduce_utils.py | 1 + vllm/distributed/device_communicators/hpu_communicator.py | 1 + .../distributed/device_communicators/neuron_communicator.py | 1 
+ vllm/distributed/device_communicators/pynccl.py | 1 + vllm/distributed/device_communicators/pynccl_wrapper.py | 1 + vllm/distributed/device_communicators/shm_broadcast.py | 1 + vllm/distributed/device_communicators/tpu_communicator.py | 1 + vllm/distributed/device_communicators/xpu_communicator.py | 1 + vllm/distributed/kv_events.py | 1 + vllm/distributed/kv_transfer/__init__.py | 1 + vllm/distributed/kv_transfer/kv_connector/base.py | 1 + vllm/distributed/kv_transfer/kv_connector/factory.py | 1 + .../kv_transfer/kv_connector/lmcache_connector.py | 1 + .../kv_transfer/kv_connector/mooncake_store_connector.py | 1 + .../kv_transfer/kv_connector/simple_connector.py | 1 + vllm/distributed/kv_transfer/kv_connector/utils.py | 1 + vllm/distributed/kv_transfer/kv_connector/v1/__init__.py | 1 + vllm/distributed/kv_transfer/kv_connector/v1/base.py | 1 + .../kv_transfer/kv_connector/v1/lmcache_connector.py | 1 + .../kv_transfer/kv_connector/v1/multi_connector.py | 1 + .../kv_transfer/kv_connector/v1/nixl_connector.py | 1 + .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 1 + vllm/distributed/kv_transfer/kv_connector_agent.py | 1 + vllm/distributed/kv_transfer/kv_lookup_buffer/base.py | 1 + .../kv_transfer/kv_lookup_buffer/mooncake_store.py | 1 + .../kv_transfer/kv_lookup_buffer/simple_buffer.py | 1 + vllm/distributed/kv_transfer/kv_pipe/base.py | 1 + vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py | 1 + vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py | 1 + vllm/distributed/kv_transfer/kv_transfer_state.py | 1 + vllm/distributed/parallel_state.py | 1 + vllm/distributed/utils.py | 1 + vllm/engine/arg_utils.py | 1 + vllm/engine/async_llm_engine.py | 1 + vllm/engine/async_timeout.py | 1 + vllm/engine/llm_engine.py | 1 + vllm/engine/metrics.py | 1 + vllm/engine/metrics_types.py | 1 + vllm/engine/multiprocessing/__init__.py | 1 + vllm/engine/multiprocessing/client.py | 1 + vllm/engine/multiprocessing/engine.py | 1 + 
vllm/engine/output_processor/interfaces.py | 1 + vllm/engine/output_processor/multi_step.py | 1 + vllm/engine/output_processor/single_step.py | 1 + vllm/engine/output_processor/stop_checker.py | 1 + vllm/engine/output_processor/util.py | 1 + vllm/engine/protocol.py | 1 + vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/chat_utils.py | 1 + vllm/entrypoints/cli/benchmark/base.py | 1 + vllm/entrypoints/cli/benchmark/latency.py | 1 + vllm/entrypoints/cli/benchmark/main.py | 1 + vllm/entrypoints/cli/benchmark/serve.py | 1 + vllm/entrypoints/cli/benchmark/throughput.py | 1 + vllm/entrypoints/cli/collect_env.py | 1 + vllm/entrypoints/cli/main.py | 1 + vllm/entrypoints/cli/openai.py | 1 + vllm/entrypoints/cli/run_batch.py | 1 + vllm/entrypoints/cli/serve.py | 1 + vllm/entrypoints/cli/types.py | 1 + vllm/entrypoints/launcher.py | 1 + vllm/entrypoints/llm.py | 1 + vllm/entrypoints/logger.py | 1 + vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 1 + vllm/entrypoints/openai/logits_processors.py | 1 + vllm/entrypoints/openai/protocol.py | 1 + vllm/entrypoints/openai/run_batch.py | 1 + vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_classification.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_embedding.py | 1 + vllm/entrypoints/openai/serving_engine.py | 1 + vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/serving_pooling.py | 1 + vllm/entrypoints/openai/serving_score.py | 1 + vllm/entrypoints/openai/serving_tokenization.py | 1 + vllm/entrypoints/openai/serving_transcription.py | 1 + vllm/entrypoints/openai/tool_parsers/__init__.py | 1 + .../entrypoints/openai/tool_parsers/abstract_tool_parser.py | 1 + .../openai/tool_parsers/deepseekv3_tool_parser.py | 1 + .../openai/tool_parsers/granite_20b_fc_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py | 1 
+ .../openai/tool_parsers/internlm2_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py | 1 + .../openai/tool_parsers/llama4_pythonic_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py | 1 + .../entrypoints/openai/tool_parsers/phi4mini_tool_parser.py | 1 + .../entrypoints/openai/tool_parsers/pythonic_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/utils.py | 1 + vllm/entrypoints/score_utils.py | 1 + vllm/entrypoints/ssl.py | 1 + vllm/entrypoints/utils.py | 1 + vllm/env_override.py | 1 + vllm/envs.py | 1 + vllm/executor/executor_base.py | 1 + vllm/executor/mp_distributed_executor.py | 1 + vllm/executor/msgspec_utils.py | 1 + vllm/executor/multiproc_worker_utils.py | 1 + vllm/executor/ray_distributed_executor.py | 1 + vllm/executor/ray_utils.py | 1 + vllm/executor/uniproc_executor.py | 1 + vllm/forward_context.py | 1 + vllm/inputs/__init__.py | 1 + vllm/inputs/data.py | 1 + vllm/inputs/parse.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 1 + vllm/jsontree.py | 1 + vllm/logger.py | 1 + vllm/logging_utils/__init__.py | 1 + vllm/logging_utils/dump_input.py | 1 + vllm/logging_utils/formatter.py | 1 + vllm/logits_process.py | 1 + vllm/lora/fully_sharded_layers.py | 1 + vllm/lora/layers.py | 1 + vllm/lora/lora.py | 1 + vllm/lora/models.py | 1 + vllm/lora/ops/torch_ops/__init__.py | 1 + vllm/lora/ops/torch_ops/lora_ops.py | 1 + vllm/lora/ops/triton_ops/__init__.py | 1 + vllm/lora/ops/triton_ops/kernel_utils.py | 1 + vllm/lora/ops/triton_ops/lora_expand_op.py | 1 + vllm/lora/ops/triton_ops/lora_kernel_metadata.py | 1 + vllm/lora/ops/triton_ops/lora_shrink_op.py | 1 + vllm/lora/ops/triton_ops/utils.py | 1 + vllm/lora/ops/xla_ops/__init__.py | 1 + vllm/lora/ops/xla_ops/lora_ops.py | 1 + vllm/lora/peft_helper.py | 1 + vllm/lora/punica_wrapper/__init__.py | 1 + vllm/lora/punica_wrapper/punica_base.py | 1 + 
vllm/lora/punica_wrapper/punica_cpu.py | 1 + vllm/lora/punica_wrapper/punica_gpu.py | 1 + vllm/lora/punica_wrapper/punica_hpu.py | 1 + vllm/lora/punica_wrapper/punica_selector.py | 1 + vllm/lora/punica_wrapper/punica_tpu.py | 1 + vllm/lora/punica_wrapper/utils.py | 1 + vllm/lora/request.py | 1 + vllm/lora/resolver.py | 1 + vllm/lora/utils.py | 1 + vllm/lora/worker_manager.py | 1 + vllm/model_executor/__init__.py | 1 + vllm/model_executor/custom_op.py | 1 + vllm/model_executor/guided_decoding/__init__.py | 1 + vllm/model_executor/guided_decoding/guidance_decoding.py | 1 + .../guided_decoding/guidance_logits_processors.py | 1 + vllm/model_executor/guided_decoding/guided_fields.py | 1 + .../guided_decoding/lm_format_enforcer_decoding.py | 1 + vllm/model_executor/guided_decoding/outlines_decoding.py | 1 + .../guided_decoding/outlines_logits_processors.py | 1 + vllm/model_executor/guided_decoding/utils.py | 1 + vllm/model_executor/guided_decoding/xgrammar_decoding.py | 1 + vllm/model_executor/layers/activation.py | 1 + vllm/model_executor/layers/fused_moe/__init__.py | 1 + vllm/model_executor/layers/fused_moe/cutlass_moe.py | 1 + vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_marlin_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/layers/fused_moe/modular_kernel.py | 1 + .../model_executor/layers/fused_moe/moe_align_block_size.py | 1 + vllm/model_executor/layers/fused_moe/moe_pallas.py | 1 + .../layers/fused_moe/moe_permute_unpermute.py | 1 + vllm/model_executor/layers/fused_moe/moe_torch_iterative.py | 1 + .../layers/fused_moe/pplx_prepare_finalize.py | 1 + vllm/model_executor/layers/fused_moe/prepare_finalize.py | 1 + .../model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 + .../model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 1 + 
vllm/model_executor/layers/fused_moe/utils.py | 1 + vllm/model_executor/layers/layernorm.py | 1 + vllm/model_executor/layers/lightning_attn.py | 1 + vllm/model_executor/layers/linear.py | 1 + vllm/model_executor/layers/logits_processor.py | 1 + vllm/model_executor/layers/mamba/mamba2_metadata.py | 1 + vllm/model_executor/layers/mamba/mamba_mixer.py | 1 + vllm/model_executor/layers/mamba/mamba_mixer2.py | 1 + vllm/model_executor/layers/mamba/ops/causal_conv1d.py | 1 + vllm/model_executor/layers/mamba/ops/mamba_ssm.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_bmm.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_combined.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_state_passing.py | 1 + vllm/model_executor/layers/pooler.py | 1 + vllm/model_executor/layers/quantization/__init__.py | 1 + vllm/model_executor/layers/quantization/aqlm.py | 1 + vllm/model_executor/layers/quantization/auto_round.py | 1 + vllm/model_executor/layers/quantization/awq.py | 1 + vllm/model_executor/layers/quantization/awq_marlin.py | 1 + vllm/model_executor/layers/quantization/awq_triton.py | 1 + vllm/model_executor/layers/quantization/base_config.py | 1 + vllm/model_executor/layers/quantization/bitblas.py | 1 + vllm/model_executor/layers/quantization/bitsandbytes.py | 1 + .../quantization/compressed_tensors/compressed_tensors.py | 1 + .../compressed_tensors/compressed_tensors_moe.py | 1 + .../quantization/compressed_tensors/schemes/__init__.py | 1 + .../compressed_tensors/schemes/compressed_tensors_24.py | 1 + .../compressed_tensors/schemes/compressed_tensors_scheme.py | 1 + .../schemes/compressed_tensors_w4a16_24.py | 1 + .../schemes/compressed_tensors_w4a16_nvfp4.py | 1 + .../schemes/compressed_tensors_w8a16_fp8.py | 1 + .../schemes/compressed_tensors_w8a8_fp8.py | 1 + .../schemes/compressed_tensors_w8a8_int8.py | 1 + 
.../compressed_tensors/schemes/compressed_tensors_wNa16.py | 1 + .../quantization/compressed_tensors/triton_scaled_mm.py | 1 + .../layers/quantization/compressed_tensors/utils.py | 1 + vllm/model_executor/layers/quantization/deepspeedfp.py | 1 + vllm/model_executor/layers/quantization/experts_int8.py | 1 + vllm/model_executor/layers/quantization/fbgemm_fp8.py | 1 + vllm/model_executor/layers/quantization/fp8.py | 1 + vllm/model_executor/layers/quantization/gguf.py | 1 + vllm/model_executor/layers/quantization/gptq.py | 1 + vllm/model_executor/layers/quantization/gptq_bitblas.py | 1 + vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + vllm/model_executor/layers/quantization/gptq_marlin_24.py | 1 + vllm/model_executor/layers/quantization/hqq_marlin.py | 1 + vllm/model_executor/layers/quantization/ipex_quant.py | 1 + .../quantization/kernels/mixed_precision/MPLinearKernel.py | 1 + .../layers/quantization/kernels/mixed_precision/__init__.py | 1 + .../layers/quantization/kernels/mixed_precision/allspark.py | 1 + .../layers/quantization/kernels/mixed_precision/bitblas.py | 1 + .../layers/quantization/kernels/mixed_precision/exllama.py | 1 + .../layers/quantization/kernels/mixed_precision/machete.py | 1 + .../layers/quantization/kernels/mixed_precision/marlin.py | 1 + .../quantization/kernels/scaled_mm/ScaledMMLinearKernel.py | 1 + .../layers/quantization/kernels/scaled_mm/__init__.py | 1 + .../layers/quantization/kernels/scaled_mm/aiter.py | 1 + .../layers/quantization/kernels/scaled_mm/cutlass.py | 1 + .../layers/quantization/kernels/scaled_mm/triton.py | 1 + .../layers/quantization/kernels/scaled_mm/xla.py | 1 + vllm/model_executor/layers/quantization/kv_cache.py | 1 + vllm/model_executor/layers/quantization/marlin.py | 1 + vllm/model_executor/layers/quantization/modelopt.py | 1 + vllm/model_executor/layers/quantization/moe_wna16.py | 1 + vllm/model_executor/layers/quantization/neuron_quant.py | 1 + vllm/model_executor/layers/quantization/ptpc_fp8.py | 1 + 
vllm/model_executor/layers/quantization/qqq.py | 1 + vllm/model_executor/layers/quantization/quark/quark.py | 1 + vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 + .../layers/quantization/quark/schemes/__init__.py | 1 + .../layers/quantization/quark/schemes/quark_scheme.py | 1 + .../layers/quantization/quark/schemes/quark_w4a4_mxfp4.py | 1 + .../layers/quantization/quark/schemes/quark_w8a8_fp8.py | 1 + .../layers/quantization/quark/schemes/quark_w8a8_int8.py | 1 + vllm/model_executor/layers/quantization/quark/utils.py | 1 + vllm/model_executor/layers/quantization/schema.py | 1 + vllm/model_executor/layers/quantization/torchao.py | 1 + vllm/model_executor/layers/quantization/tpu_int8.py | 1 + vllm/model_executor/layers/quantization/utils/__init__.py | 1 + .../layers/quantization/utils/allspark_utils.py | 1 + .../layers/quantization/utils/bitblas_utils.py | 1 + vllm/model_executor/layers/quantization/utils/fp8_utils.py | 1 + vllm/model_executor/layers/quantization/utils/gptq_utils.py | 1 + vllm/model_executor/layers/quantization/utils/int8_utils.py | 1 + .../model_executor/layers/quantization/utils/layer_utils.py | 1 + .../layers/quantization/utils/machete_utils.py | 1 + .../layers/quantization/utils/marlin_utils.py | 1 + .../layers/quantization/utils/marlin_utils_fp4.py | 1 + .../layers/quantization/utils/marlin_utils_fp8.py | 1 + .../layers/quantization/utils/marlin_utils_test.py | 1 + .../layers/quantization/utils/marlin_utils_test_24.py | 1 + .../layers/quantization/utils/marlin_utils_test_qqq.py | 1 + .../model_executor/layers/quantization/utils/mxfp4_utils.py | 1 + .../layers/quantization/utils/nvfp4_emulation_utils.py | 1 + .../model_executor/layers/quantization/utils/quant_utils.py | 1 + vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 1 + vllm/model_executor/layers/rejection_sampler.py | 1 + vllm/model_executor/layers/resampler.py | 1 + vllm/model_executor/layers/rotary_embedding.py | 1 + vllm/model_executor/layers/sampler.py | 1 
+ vllm/model_executor/layers/spec_decode_base_sampler.py | 1 + vllm/model_executor/layers/typical_acceptance_sampler.py | 1 + vllm/model_executor/layers/utils.py | 1 + vllm/model_executor/layers/vocab_parallel_embedding.py | 1 + vllm/model_executor/model_loader/__init__.py | 1 + vllm/model_executor/model_loader/base_loader.py | 1 + vllm/model_executor/model_loader/bitsandbytes_loader.py | 1 + vllm/model_executor/model_loader/default_loader.py | 1 + vllm/model_executor/model_loader/dummy_loader.py | 1 + vllm/model_executor/model_loader/gguf_loader.py | 1 + vllm/model_executor/model_loader/neuron.py | 1 + vllm/model_executor/model_loader/neuronx_distributed.py | 1 + vllm/model_executor/model_loader/runai_streamer_loader.py | 1 + vllm/model_executor/model_loader/sharded_state_loader.py | 1 + vllm/model_executor/model_loader/tensorizer.py | 1 + vllm/model_executor/model_loader/tensorizer_loader.py | 1 + vllm/model_executor/model_loader/utils.py | 1 + vllm/model_executor/model_loader/weight_utils.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/adapters.py | 1 + vllm/model_executor/models/aimv2.py | 1 + vllm/model_executor/models/arctic.py | 1 + vllm/model_executor/models/aria.py | 1 + vllm/model_executor/models/aya_vision.py | 3 ++- vllm/model_executor/models/baichuan.py | 1 + vllm/model_executor/models/bamba.py | 1 + vllm/model_executor/models/bart.py | 1 + vllm/model_executor/models/bert.py | 1 + vllm/model_executor/models/bert_with_rope.py | 1 + vllm/model_executor/models/blip.py | 1 + vllm/model_executor/models/blip2.py | 1 + vllm/model_executor/models/bloom.py | 1 + vllm/model_executor/models/chameleon.py | 1 + vllm/model_executor/models/chatglm.py | 1 + vllm/model_executor/models/clip.py | 1 + vllm/model_executor/models/commandr.py | 1 + vllm/model_executor/models/constant_size_cache.py | 1 + vllm/model_executor/models/dbrx.py | 1 + vllm/model_executor/models/deepseek.py | 1 + vllm/model_executor/models/deepseek_mtp.py | 1 + 
vllm/model_executor/models/deepseek_v2.py | 1 + vllm/model_executor/models/deepseek_vl2.py | 1 + vllm/model_executor/models/eagle.py | 1 + vllm/model_executor/models/exaone.py | 1 + vllm/model_executor/models/fairseq2_llama.py | 1 + vllm/model_executor/models/falcon.py | 1 + vllm/model_executor/models/falcon_h1.py | 1 + vllm/model_executor/models/florence2.py | 1 + vllm/model_executor/models/fuyu.py | 1 + vllm/model_executor/models/gemma.py | 1 + vllm/model_executor/models/gemma2.py | 1 + vllm/model_executor/models/gemma3.py | 1 + vllm/model_executor/models/gemma3_mm.py | 1 + vllm/model_executor/models/glm.py | 1 + vllm/model_executor/models/glm4.py | 1 + vllm/model_executor/models/glm4v.py | 1 + vllm/model_executor/models/gpt2.py | 1 + vllm/model_executor/models/gpt_bigcode.py | 1 + vllm/model_executor/models/gpt_j.py | 1 + vllm/model_executor/models/gpt_neox.py | 1 + vllm/model_executor/models/granite.py | 1 + vllm/model_executor/models/granite_speech.py | 1 + vllm/model_executor/models/granitemoe.py | 1 + vllm/model_executor/models/granitemoehybrid.py | 1 + vllm/model_executor/models/granitemoeshared.py | 1 + vllm/model_executor/models/gritlm.py | 1 + vllm/model_executor/models/grok1.py | 1 + vllm/model_executor/models/h2ovl.py | 1 + vllm/model_executor/models/idefics2_vision_model.py | 1 + vllm/model_executor/models/idefics3.py | 1 + vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/interfaces_base.py | 1 + vllm/model_executor/models/intern_vit.py | 1 + vllm/model_executor/models/internlm2.py | 1 + vllm/model_executor/models/internlm2_ve.py | 1 + vllm/model_executor/models/internvl.py | 1 + vllm/model_executor/models/jais.py | 1 + vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/kimi_vl.py | 1 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llama4.py | 1 + vllm/model_executor/models/llama_eagle.py | 1 + vllm/model_executor/models/llama_eagle3.py | 1 + vllm/model_executor/models/llava.py | 1 + 
vllm/model_executor/models/llava_next.py | 1 + vllm/model_executor/models/llava_next_video.py | 1 + vllm/model_executor/models/llava_onevision.py | 1 + vllm/model_executor/models/mamba.py | 1 + vllm/model_executor/models/mamba2.py | 1 + vllm/model_executor/models/mamba_cache.py | 1 + vllm/model_executor/models/medusa.py | 1 + vllm/model_executor/models/mimo.py | 1 + vllm/model_executor/models/mimo_mtp.py | 1 + vllm/model_executor/models/minicpm.py | 1 + vllm/model_executor/models/minicpm3.py | 1 + vllm/model_executor/models/minicpm_eagle.py | 1 + vllm/model_executor/models/minicpmo.py | 1 + vllm/model_executor/models/minicpmv.py | 1 + vllm/model_executor/models/minimax_cache.py | 1 + vllm/model_executor/models/minimax_text_01.py | 1 + vllm/model_executor/models/minimax_vl_01.py | 1 + vllm/model_executor/models/mistral3.py | 1 + vllm/model_executor/models/mixtral.py | 1 + vllm/model_executor/models/mixtral_quant.py | 1 + vllm/model_executor/models/mllama.py | 1 + vllm/model_executor/models/mllama4.py | 1 + vllm/model_executor/models/mlp_speculator.py | 1 + vllm/model_executor/models/modernbert.py | 1 + vllm/model_executor/models/module_mapping.py | 1 + vllm/model_executor/models/molmo.py | 1 + vllm/model_executor/models/moonvit.py | 1 + vllm/model_executor/models/mpt.py | 1 + vllm/model_executor/models/nemotron.py | 1 + vllm/model_executor/models/nemotron_nas.py | 1 + vllm/model_executor/models/nvlm_d.py | 1 + vllm/model_executor/models/olmo.py | 1 + vllm/model_executor/models/olmo2.py | 1 + vllm/model_executor/models/olmoe.py | 1 + vllm/model_executor/models/opt.py | 1 + vllm/model_executor/models/orion.py | 1 + vllm/model_executor/models/ovis.py | 1 + vllm/model_executor/models/paligemma.py | 1 + vllm/model_executor/models/persimmon.py | 1 + vllm/model_executor/models/phi.py | 1 + vllm/model_executor/models/phi3.py | 1 + vllm/model_executor/models/phi3_small.py | 1 + vllm/model_executor/models/phi3v.py | 1 + vllm/model_executor/models/phi4mm.py | 1 + 
vllm/model_executor/models/phi4mm_audio.py | 1 + vllm/model_executor/models/phi4mm_utils.py | 1 + vllm/model_executor/models/phimoe.py | 1 + vllm/model_executor/models/pixtral.py | 1 + vllm/model_executor/models/plamo2.py | 1 + vllm/model_executor/models/prithvi_geospatial_mae.py | 1 + vllm/model_executor/models/qwen.py | 1 + vllm/model_executor/models/qwen2.py | 1 + vllm/model_executor/models/qwen2_5_omni_thinker.py | 1 + vllm/model_executor/models/qwen2_5_vl.py | 1 + vllm/model_executor/models/qwen2_audio.py | 1 + vllm/model_executor/models/qwen2_moe.py | 1 + vllm/model_executor/models/qwen2_rm.py | 1 + vllm/model_executor/models/qwen2_vl.py | 1 + vllm/model_executor/models/qwen3.py | 1 + vllm/model_executor/models/qwen3_moe.py | 1 + vllm/model_executor/models/qwen_vl.py | 1 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/roberta.py | 1 + vllm/model_executor/models/siglip.py | 1 + vllm/model_executor/models/skyworkr1v.py | 1 + vllm/model_executor/models/smolvlm.py | 1 + vllm/model_executor/models/solar.py | 1 + vllm/model_executor/models/stablelm.py | 1 + vllm/model_executor/models/starcoder2.py | 1 + vllm/model_executor/models/telechat2.py | 1 + vllm/model_executor/models/teleflm.py | 1 + vllm/model_executor/models/transformers.py | 1 + vllm/model_executor/models/ultravox.py | 1 + vllm/model_executor/models/utils.py | 1 + vllm/model_executor/models/vision.py | 1 + vllm/model_executor/models/whisper.py | 1 + vllm/model_executor/models/zamba2.py | 1 + vllm/model_executor/parameter.py | 1 + vllm/model_executor/pooling_metadata.py | 1 + vllm/model_executor/sampling_metadata.py | 1 + vllm/model_executor/utils.py | 1 + vllm/multimodal/__init__.py | 1 + vllm/multimodal/audio.py | 1 + vllm/multimodal/base.py | 1 + vllm/multimodal/hasher.py | 1 + vllm/multimodal/image.py | 1 + vllm/multimodal/inputs.py | 1 + vllm/multimodal/parse.py | 1 + vllm/multimodal/processing.py | 1 + vllm/multimodal/profiling.py | 1 + vllm/multimodal/registry.py | 1 + 
vllm/multimodal/utils.py | 1 + vllm/multimodal/video.py | 1 + vllm/outputs.py | 1 + vllm/platforms/__init__.py | 1 + vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 1 + vllm/platforms/neuron.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/plugins/__init__.py | 1 + vllm/plugins/lora_resolvers/filesystem_resolver.py | 1 + vllm/pooling_params.py | 1 + vllm/profiler/layerwise_profile.py | 1 + vllm/profiler/utils.py | 1 + vllm/prompt_adapter/layers.py | 1 + vllm/prompt_adapter/models.py | 1 + vllm/prompt_adapter/request.py | 1 + vllm/prompt_adapter/utils.py | 1 + vllm/prompt_adapter/worker_manager.py | 1 + vllm/reasoning/__init__.py | 1 + vllm/reasoning/abs_reasoning_parsers.py | 1 + vllm/reasoning/deepseek_r1_reasoning_parser.py | 1 + vllm/reasoning/granite_reasoning_parser.py | 1 + vllm/reasoning/qwen3_reasoning_parser.py | 1 + vllm/sampling_params.py | 1 + vllm/scalar_type.py | 1 + vllm/scripts.py | 1 + vllm/sequence.py | 1 + vllm/spec_decode/batch_expansion.py | 1 + vllm/spec_decode/draft_model_runner.py | 1 + vllm/spec_decode/interfaces.py | 1 + vllm/spec_decode/medusa_worker.py | 1 + vllm/spec_decode/metrics.py | 1 + vllm/spec_decode/mlp_speculator_worker.py | 1 + vllm/spec_decode/mqa_scorer.py | 1 + vllm/spec_decode/multi_step_worker.py | 1 + vllm/spec_decode/ngram_worker.py | 1 + vllm/spec_decode/proposer_worker_base.py | 1 + vllm/spec_decode/smaller_tp_proposer_worker.py | 1 + vllm/spec_decode/spec_decode_worker.py | 1 + vllm/spec_decode/target_model_runner.py | 1 + vllm/spec_decode/top1_proposer.py | 1 + vllm/spec_decode/util.py | 1 + vllm/test_utils.py | 1 + vllm/third_party/pynvml.py | 1 + vllm/tracing.py | 1 + vllm/transformers_utils/__init__.py | 1 + vllm/transformers_utils/chat_templates/__init__.py | 1 + vllm/transformers_utils/chat_templates/registry.py | 1 + vllm/transformers_utils/config.py | 1 + 
vllm/transformers_utils/configs/__init__.py | 1 + vllm/transformers_utils/configs/arctic.py | 1 + vllm/transformers_utils/configs/chatglm.py | 1 + vllm/transformers_utils/configs/cohere2.py | 1 + vllm/transformers_utils/configs/dbrx.py | 1 + vllm/transformers_utils/configs/deepseek_vl2.py | 1 + vllm/transformers_utils/configs/eagle.py | 1 + vllm/transformers_utils/configs/exaone.py | 1 + vllm/transformers_utils/configs/falcon.py | 1 + vllm/transformers_utils/configs/h2ovl.py | 1 + vllm/transformers_utils/configs/internvl.py | 1 + vllm/transformers_utils/configs/jais.py | 1 + vllm/transformers_utils/configs/kimi_vl.py | 1 + vllm/transformers_utils/configs/medusa.py | 1 + vllm/transformers_utils/configs/minimax_text_01.py | 1 + vllm/transformers_utils/configs/minimax_vl_01.py | 1 + vllm/transformers_utils/configs/mllama.py | 1 + vllm/transformers_utils/configs/mlp_speculator.py | 1 + vllm/transformers_utils/configs/moonvit.py | 1 + vllm/transformers_utils/configs/mpt.py | 1 + vllm/transformers_utils/configs/nemotron.py | 1 + vllm/transformers_utils/configs/nvlm_d.py | 1 + vllm/transformers_utils/configs/ovis.py | 1 + vllm/transformers_utils/configs/skyworkr1v.py | 1 + vllm/transformers_utils/configs/solar.py | 1 + vllm/transformers_utils/configs/telechat2.py | 1 + vllm/transformers_utils/configs/ultravox.py | 1 + vllm/transformers_utils/detokenizer.py | 1 + vllm/transformers_utils/detokenizer_utils.py | 1 + vllm/transformers_utils/processor.py | 1 + vllm/transformers_utils/processors/__init__.py | 1 + vllm/transformers_utils/processors/deepseek_vl2.py | 1 + vllm/transformers_utils/processors/ovis.py | 1 + vllm/transformers_utils/s3_utils.py | 1 + vllm/transformers_utils/tokenizer.py | 1 + vllm/transformers_utils/tokenizer_base.py | 1 + vllm/transformers_utils/tokenizer_group.py | 1 + vllm/transformers_utils/tokenizers/__init__.py | 1 + vllm/transformers_utils/tokenizers/mistral.py | 1 + vllm/transformers_utils/utils.py | 1 + vllm/triton_utils/__init__.py | 1 + 
vllm/triton_utils/importing.py | 1 + vllm/usage/usage_lib.py | 1 + vllm/utils.py | 1 + vllm/v1/attention/backends/flash_attn.py | 1 + vllm/v1/attention/backends/flashinfer.py | 1 + vllm/v1/attention/backends/mla/common.py | 1 + vllm/v1/attention/backends/mla/flashmla.py | 1 + vllm/v1/attention/backends/mla/rocm_aiter_mla.py | 1 + vllm/v1/attention/backends/mla/triton_mla.py | 1 + vllm/v1/attention/backends/pallas.py | 1 + vllm/v1/attention/backends/triton_attn.py | 1 + vllm/v1/attention/backends/utils.py | 1 + vllm/v1/core/block_pool.py | 1 + vllm/v1/core/encoder_cache_manager.py | 1 + vllm/v1/core/kv_cache_manager.py | 1 + vllm/v1/core/kv_cache_utils.py | 1 + vllm/v1/core/sched/interface.py | 1 + vllm/v1/core/sched/output.py | 1 + vllm/v1/core/sched/scheduler.py | 1 + vllm/v1/core/sched/utils.py | 1 + vllm/v1/core/single_type_kv_cache_manager.py | 1 + vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 1 + vllm/v1/engine/coordinator.py | 1 + vllm/v1/engine/core.py | 1 + vllm/v1/engine/core_client.py | 1 + vllm/v1/engine/detokenizer.py | 1 + vllm/v1/engine/exceptions.py | 1 + vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/logprobs.py | 1 + vllm/v1/engine/mm_input_cache.py | 1 + vllm/v1/engine/output_processor.py | 1 + vllm/v1/engine/parallel_sampling.py | 1 + vllm/v1/engine/processor.py | 1 + vllm/v1/executor/abstract.py | 1 + vllm/v1/executor/multiproc_executor.py | 1 + vllm/v1/executor/ray_distributed_executor.py | 1 + vllm/v1/kv_cache_interface.py | 1 + vllm/v1/metrics/loggers.py | 1 + vllm/v1/metrics/prometheus.py | 1 + vllm/v1/metrics/ray_wrappers.py | 1 + vllm/v1/metrics/reader.py | 1 + vllm/v1/metrics/stats.py | 1 + vllm/v1/outputs.py | 1 + vllm/v1/request.py | 1 + vllm/v1/sample/metadata.py | 1 + vllm/v1/sample/ops/bad_words.py | 1 + vllm/v1/sample/ops/penalties.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 1 + vllm/v1/sample/rejection_sampler.py | 1 + vllm/v1/sample/sampler.py | 1 + vllm/v1/sample/tpu/metadata.py | 1 + 
vllm/v1/sample/tpu/sampler.py | 1 + vllm/v1/serial_utils.py | 1 + vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/spec_decode/medusa.py | 1 + vllm/v1/spec_decode/metadata.py | 1 + vllm/v1/spec_decode/metrics.py | 1 + vllm/v1/spec_decode/ngram_proposer.py | 1 + vllm/v1/spec_decode/utils.py | 1 + vllm/v1/structured_output/__init__.py | 1 + vllm/v1/structured_output/backend_guidance.py | 1 + vllm/v1/structured_output/backend_types.py | 1 + vllm/v1/structured_output/backend_xgrammar.py | 1 + vllm/v1/structured_output/request.py | 1 + vllm/v1/structured_output/utils.py | 1 + vllm/v1/utils.py | 1 + vllm/v1/worker/block_table.py | 1 + vllm/v1/worker/gpu_input_batch.py | 1 + vllm/v1/worker/gpu_model_runner.py | 1 + vllm/v1/worker/gpu_worker.py | 1 + vllm/v1/worker/lora_model_runner_mixin.py | 1 + vllm/v1/worker/tpu_model_runner.py | 1 + vllm/v1/worker/tpu_worker.py | 1 + vllm/v1/worker/utils.py | 1 + vllm/v1/worker/worker_base.py | 1 + vllm/version.py | 1 + vllm/worker/cache_engine.py | 1 + vllm/worker/cpu_enc_dec_model_runner.py | 1 + vllm/worker/cpu_model_runner.py | 1 + vllm/worker/cpu_pooling_model_runner.py | 1 + vllm/worker/cpu_worker.py | 1 + vllm/worker/enc_dec_model_runner.py | 1 + vllm/worker/hpu_model_runner.py | 1 + vllm/worker/hpu_worker.py | 1 + vllm/worker/model_runner.py | 1 + vllm/worker/model_runner_base.py | 1 + vllm/worker/multi_step_hpu_worker.py | 1 + vllm/worker/multi_step_model_runner.py | 1 + vllm/worker/multi_step_neuron_model_runner.py | 1 + vllm/worker/multi_step_neuronx_distributed_model_runner.py | 1 + vllm/worker/multi_step_tpu_worker.py | 1 + vllm/worker/multi_step_worker.py | 1 + vllm/worker/neuron_model_runner.py | 1 + vllm/worker/neuron_worker.py | 1 + vllm/worker/neuronx_distributed_model_runner.py | 1 + vllm/worker/pooling_model_runner.py | 1 + vllm/worker/tpu_model_runner.py | 1 + vllm/worker/tpu_worker.py | 1 + vllm/worker/utils.py | 1 + vllm/worker/worker.py | 1 + vllm/worker/worker_base.py | 1 + vllm/worker/xpu_model_runner.py | 1 + 
vllm/worker/xpu_worker.py | 1 + 1432 files changed, 1441 insertions(+), 6 deletions(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29881fcbac01..68aff793ae6aa 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 270663c415c72..7045d8810493e 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py index 769d2efda4adc..c0d60dd5328f4 100644 --- a/.buildkite/lm-eval-harness/conftest.py +++ b/.buildkite/lm-eval-harness/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import pytest diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 409a6ca820082..930adfaf3e192 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LM eval harness on model to compare vs HF baseline computed offline. 
Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 7f2a2d8dc2969..a4f1638c1adb8 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 778a3a8d87f63..8532ff7ef798c 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 10a7a2f5a467e..053fd52c35ae9 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index e5f179a0f5b68..ddea1d2b1b1ed 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from lmdeploy.serve.openai.api_client import APIClient diff 
--git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 2a7b37991f31a..fb3b9d5e34e03 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 85e6eda7f36fd..ddb38e304cd65 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io import json diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d86bf045ea47e..80a9246aa0b79 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. 
Each dataset subclass of BenchmarkDataset must implement sample diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index de62bf5c63c76..c06857247eeed 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 109624c877891..00869fa94e71a 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index ffaa8035797c1..3e4704f0b8205 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the efficiency of prefix caching. 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index a05dd24dece83..5496703f23ccb 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline prioritization.""" import argparse diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 6bd9f1b49c2ec..81428fb7dae12 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 6a50f47d3951c..3848ebda959ac 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7a13babda9d16..d19753d40e497 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index b0c4fca92c3d0..272b7979cc551 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index da258f98e085f..9ec270bbd2e98 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index 7e9f5a7fc0f46..b4f3c6bf94eda 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils from collections.abc import Iterable diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 08e93837f7ddf..cec422e8d597f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d31b623a1ee60..25b96ef56620e 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index fce156e1c96c6..f62d8102e2d9f 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index fd19b40bf252c..b1df2f255822d 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 484d0cb3cba7d..74fa56d076cf1 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 37a9173a1a937..901524214469e 100644 --- 
a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle as pkl import time diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index 36d03e40ef9a1..640a334190052 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy import itertools diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9934aa479dd6..42de062b08e42 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index d40ab70ec539b..97ee060341373 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d39d8a6e3aba3..3383fb78872a2 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe kernel. 
The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 2197bceabe6c0..1be83b84e95b8 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index f21ca97eeb8a9..69978ec6b23e9 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 6c1284930c1ec..3d38d4b3534e8 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f8f1db04790bf..0f896f187ecb9 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b17baff2e5f5d..9ea1fddae2a3b 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import torch import torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2f7660858f57..6cb55b35993ef 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 333986fdf5eff..dba1f3943b96c 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from typing import Any, TypedDict diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 54f05e7232265..7e0376c18ecc7 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import time diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 2463dfebe83cc..6ab26f5f1adf7 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index d720083b61503..4cf633a81358d 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import itertools from typing import Optional, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 944024ca35725..b81baf17a8c67 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate from typing import Optional diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 70190ba24d9df..18c459c31d3f8 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 6315c1ee6cdd6..4fcdbadd65ecd 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from sglang quantization/tuning_block_wise_kernel.py import argparse diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e377648254512..e67ce05453181 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # fmt: off # ruff: noqa: E501 import time diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 0c86e40729579..9a4da0ef5a85d 100644 --- a/benchmarks/kernels/graph_machete_bench.py 
+++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pickle diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 877a29feed9df..4bbb36bb43592 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from collections.abc import Iterable diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index afe159ddda6e8..a27f02394afbd 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d5701a8fbd6d8..0957a9c65f06c 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import cProfile import pstats diff --git a/cmake/hipify.py b/cmake/hipify.py index a15577125eb1f..55d378f5b1113 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d64f0d0a5c2a0..1dd7101acc27d 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ 
b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import Union diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 15f008d4f61ed..49f33718a21e8 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py index 4ac7121ab4e1b..18fb6c1a81f84 100644 --- a/csrc/quantization/gptq_marlin/generate_kernels.py +++ b/csrc/quantization/gptq_marlin/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 3114e14baa0c5..9af7833d09f32 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import math diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6f290efe45c2f..7cfc89605150e 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from dataclasses import dataclass, field from pathlib import Path diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py index 
e5f8549d83837..f67941d2ad1b5 100644 --- a/docs/mkdocs/hooks/remove_announcement.py +++ b/docs/mkdocs/hooks/remove_announcement.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Literal diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py index c738828085ba7..6484581ed9478 100644 --- a/docs/mkdocs/hooks/url_schemes.py +++ b/docs/mkdocs/hooks/url_schemes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import regex as re from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import Files diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 56cdd6861baa4..8e5cac78a4b20 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py index 0d8c733042376..a01a9565a5fde 100644 --- a/examples/offline_inference/automatic_prefix_caching.py +++ b/examples/offline_inference/automatic_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstration script for Automatic Prefix Caching (APC) in vLLM. 
diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py index ae5ae7cb48346..78bfda9bcf4e3 100644 --- a/examples/offline_inference/basic/basic.py +++ b/examples/offline_inference/basic/basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index b0bb5aa71b8a7..d078c517d00e7 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 40ccb1294e424..219064e97429b 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 38a73ccca251e..fc5ca23787be1 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 72f4a8208386d..6a41ef4d84bb6 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 3da73c6c407d4..6a08de2d2c38c 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py index c1edfb52ff70c..b1c1ef620da8d 100644 --- a/examples/offline_inference/batch_llm_inference.py +++ b/examples/offline_inference/batch_llm_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use Ray Data for data parallel batch inference. diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 61230d8955842..6e56e24f2092c 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import json diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index 1a70446c30a05..8d7666418559f 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 15906e1a2768d..3eccb4e11ab6f 100644 --- a/examples/offline_inference/data_parallel.py +++ 
b/examples/offline_inference/data_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Usage: Single node: diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 4ae5d3310e0bf..8f3d1a5c00369 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 5757a8a84b86a..0bfe7ec0e6cf6 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index 3ccab0dcd6d32..05a361fee0717 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of disaggregated prefilling We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 606ce7799a88f..ce977ee99bb8f 100644 --- 
a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json import os diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index 23f60c431fc24..e68128399ba21 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 59c0592ae9e23..7f5d74d9a3ae0 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 83dd1f667eb5f..0da6fa5c4af5f 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text encoder/decoder models, specifically BART diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index ae3737e375941..d27a902edb7e7 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ 
This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 5d5e55a83d221..d7f2a1633113d 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates using the `LLMEngine` for processing prompts with various sampling parameters. diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index 5bb2327a3f83e..cc78c0cbbf7c0 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Validates the loading of a model saved with the sharded_state format. This script demonstrates how to load a model that was previously saved diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 33c660015ba76..00d4cb9eb4c41 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use LoRA with different quantization techniques for offline inference. 
diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py index 7927f758cb575..00fb3f5bc8917 100644 --- a/examples/offline_inference/metrics.py +++ b/examples/offline_inference/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 98fef2648f6bb..330103d5818a3 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import argparse diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index b750397f45b8d..d5b1b4ad29a92 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the usage of text generation with an LLM model, comparing the performance with and without speculative decoding. diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 1fa2f16f82a8a..f0c00bcaaeb11 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use the multi-LoRA functionality for offline inference. 
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index f2d7698f22d7c..7826629a36d01 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index 5d7fb819d3477..0b2070c8e2531 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to run offline inference with an EAGLE speculative decoding model on neuron. To use EAGLE speculative decoding, you must use diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index ec38525b9daf2..c0ecfac508996 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py index a9478650b16f1..6ff8faabd748b 100644 --- a/examples/offline_inference/neuron_multimodal.py +++ b/examples/offline_inference/neuron_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests import torch from neuronx_distributed_inference.models.mllama.utils import add_instruct diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index ecacbab771c2a..2ef69f29863d7 100644 --- 
a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to run offline inference with a speculative decoding model on neuron. diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index d3dad24956a69..6998913823947 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 21f7668adc863..567c448a8c97b 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This is a demo script showing how to use the PrithviGeospatialMAE model with vLLM diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 244a64b891c96..392fba8fc5ead 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect import json diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index 82737d538df4f..5200be82694ab 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import dataclasses diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py index 9f6a602233f8a..5d79222a1bb3a 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/offline_inference/prompt_embed_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrates how to generate prompt embeddings using Hugging Face Transformers and use them as input to vLLM diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 6482490d1a93a..62effd5c8b62e 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on Qwen2.5-Omni (thinker only). 
diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index 856a35b0e59be..d8d61667f688b 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from urllib.request import urlopen diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py index 6d048986e7109..d909438b41042 100644 --- a/examples/offline_inference/reproducibility.py +++ b/examples/offline_inference/reproducibility.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrates how to achieve reproducibility in vLLM. diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index a8f6977e29a49..c6e63531a99d1 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ a simple demonstration of RLHF with vLLM, inspired by the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . 
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 76eafdca1f6c7..096363e683017 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ a simple demonstration to show how to co-locate vLLM worker with training actors on the same GPUs, diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index 3461af707eba8..c445224d75686 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 860fe2b5fe067..9b154e370642b 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Saves each worker's model state dict directly to a checkpoint, which enables a fast load path for large tensor-parallel models where each worker only needs to diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index d583110c8e69b..46858fffadc52 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 9ed7299606b7e..8ef121ebe848e 100644 --- 
a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of guided decoding to generate structured outputs using vLLM. It shows how to apply diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index 2fa49c0835e32..3d3d7946cdb41 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ experimental support for tensor-parallel inference with torchrun, see https://github.com/vllm-project/vllm/issues/11400 for diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index f3c2859d44d17..9776f4fe322b9 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 2ef87f4f4696e..15dbd9f44128a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for text generation. 
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index cee02d06c607c..1f5bd4ad72b05 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for multimodal embedding. diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 7ce28c5a4f09f..de6365c0d8581 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with multi-image input on vision language models for text generation, diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index cc190e91c141d..84854911bade1 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for `vllm.entrypoints.api_server` Start the demo server: python -m vllm.entrypoints.api_server --model diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index e57b94e8805f9..63c9ff9e93980 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ 
Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 2ffba4a7ed3f9..16c32dcaa5d31 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file provides a disaggregated prefilling proxy demo to demonstrate an example usage of XpYd disaggregated prefilling. diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 3f2a3d01b4563..d5d0a07a29183 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example for starting a Gradio OpenAI Chatbot Webserver Start vLLM API server: vllm serve meta-llama/Llama-2-7b-chat-hf diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index fd341ff493b56..86d9ceb48bb04 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example for starting a Gradio Webserver Start vLLM API server: python -m vllm.entrypoints.api_server \ diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 7eb3d2193f41b..908d6a9240aa9 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ 
b/examples/online_serving/jinaai_rerank_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example of using the OpenAI entrypoint's rerank API which is compatible with Jina and Cohere https://jina.ai/reranker diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index 65d74dccab807..584db53db4e40 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional, Union import msgspec diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index 2856e3be3e2dd..def95deb0c95d 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for OpenAI Chat Completion using vLLM API server NOTE: start a supported chat completion model server with `vllm serve`, e.g. vllm serve meta-llama/Llama-2-7b-chat-hf diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 8c3c6ecdd4b01..c99b5148de875 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """An example showing how to use vLLM to serve multimodal models and run online serving with OpenAI client. 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index a0d7841f644fc..41dbb3236297c 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Set up this example by starting a vLLM OpenAI-compatible server with tool call options enabled. For example: diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 45c4232fe1dea..7eb8668213eef 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ To run this example, you can start the vLLM server without any specific flags: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index a4134ea43c4b3..64379083dcca8 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ To run this example, you need to start the vLLM server: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py index c73208abe6005..ec7d8b95472e6 100644 --- 
a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import OpenAI # This example demonstrates the `structural_tag` response format. diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index 1ca61a8d5895f..bfbee7513874a 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate structured outputs from reasoning models like DeepSeekR1. The thinking process will not be guided by the JSON diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index a5febad45863b..4006d07f73b00 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example demonstrates how to use tool calling with reasoning models like QwQ-32B. 
The reasoning_content will not be parsed by the tool diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index f6b8082115f12..932dbeb2e7a24 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index f984fbabf24fd..5a91929770945 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index ee519e555ff7f..70f3c2f19cf14 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import base64 diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py index 649cfa5d6686b..b10e7acbd26c1 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/openai_classification_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import pprint diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index b1d21b5e4b9f7..df6e4e9429650 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 7891e14cb71e2..2e0d168d615c6 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example online usage of Score API. 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index a055654e91332..6bc390861e2ee 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import OpenAI diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py index 4544dcfb5ab09..653da8d18b705 100644 --- a/examples/online_serving/openai_embedding_matryoshka_fy.py +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for embedding API dimensions using vLLM API server NOTE: start a supported Matryoshka Embeddings model server with `vllm serve`, e.g. diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index 2620a12320241..8252b36705cc6 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example online usage of Pooling API. 
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index eb501ae72aa9f..12d45de3c81b0 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index 33d365f0caa56..018d986ad8732 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py index 85ea2340736e8..3a90421383775 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vLLM OpenAI-Compatible Client with Prompt Embeddings diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py index a76020130c3ac..9471563ddb76b 100644 --- a/examples/online_serving/ray_serve_deepseek.py +++ b/examples/online_serving/ray_serve_deepseek.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example to deploy DeepSeek R1 or V3 with Ray Serve LLM. 
See more details at: diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py index 37af3b3887f57..d9a4cadb036e2 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py +++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Retrieval Augmented Generation (RAG) Implementation with Langchain ================================================================== diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py index 08796b1b3a546..be4796acd1b67 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py +++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ RAG (Retrieval Augmented Generation) Implementation with LlamaIndex ================================================================ diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py index 0722aa671f66b..dab56172ee3a3 100644 --- a/examples/online_serving/streamlit_openai_chatbot_webserver.py +++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vLLM Chat Assistant - A Streamlit Web Interface diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py index 0781a27f19c51..a512d8a31b53e 100644 --- a/examples/online_serving/utils.py +++ b/examples/online_serving/utils.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import APIConnectionError, OpenAI from openai.pagination import SyncPage from openai.types.model import Model diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py index 98eafb31ed4f1..354e4cc8c5723 100644 --- a/examples/others/lmcache/cpu_offload_lmcache.py +++ b/examples/others/lmcache/cpu_offload_lmcache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of cpu offloading with LMCache in vLLM v1 or v0. diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py index b2b7b3b2c1f97..6669eb3fb3d38 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of disaggregated prefilling with LMCache. 
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 20155c2036580..5d8e38c73b89a 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py index 89945d67a6f38..508cf4a5a4987 100644 --- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py +++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of remote KV cache sharing with LMCache. 
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 1757776308334..9e1003a5c39d0 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import dataclasses diff --git a/find_cuda_init.py b/find_cuda_init.py index 0d13b2f862102..308fc6fc2d61c 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib import traceback diff --git a/setup.py b/setup.py index c190864dda94e..b07cdea302900 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ctypes import importlib.util diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1e3c2d1a473a3..163185b90be91 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """vllm.entrypoints.api_server with some extra logging for testing.""" from collections.abc import Iterable from typing import Any diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py index 1a20e2c135c2e..375b248ebedaa 100644 --- a/tests/async_engine/conftest.py +++ b/tests/async_engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 410cece795e94..38ecaf2233d99 100644 --- a/tests/async_engine/test_api_server.py +++ 
b/tests/async_engine/test_api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import subprocess diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index b6f44871497c8..1a31bdbfccb34 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index fd6d89d4e00de..1851eeeda7905 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 11c8e7a4b9d1c..46be4a3c3e851 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the short outputs of HF and vLLM when using greedy sampling. Run `pytest tests/basic_correctness/test_basic_correctness.py`. diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 06c9e25ed8dd8..eb5b09ff74f60 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of HF and vLLM when using greedy sampling. It tests chunked prefill. 
Chunked prefill can be enabled by diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index be3ad12396b4b..28bfe9e7c8020 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ..utils import compare_two_settings diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 76b266aada684..34f9389c82a9b 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 63dc0f8c8e3b2..341a39a42b85e 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the short outputs of HF and vLLM when using greedy sampling. VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. 
diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py index 8537459b9f94d..2279c846e01cd 100644 --- a/tests/benchmarks/test_latency_cli.py +++ b/tests/benchmarks/test_latency_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index b746d6b7853c9..a3181952677fd 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py index 2045b36293565..b61e51db4fbe4 100644 --- a/tests/benchmarks/test_throughput_cli.py +++ b/tests/benchmarks/test_throughput_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/build_cython.py b/tests/build_cython.py index 9dea6bcd62f3f..f4a334aa3b484 100644 --- a/tests/build_cython.py +++ b/tests/build_cython.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import Cython.Compiler.Options from Cython.Build import cythonize from setuptools import setup diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 5a02c4e2b3782..60334f5e4f683 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy from typing import Callable, Union diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py index 7118810a58614..d86ca37109237 100644 --- a/tests/compile/conftest.py +++ 
b/tests/compile/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index a71a40cda73ea..3188ea40f9ee6 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import os diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 5ce520a440257..852aa44d47aa5 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 22560befcbd56..2464d7889861f 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. 
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 8e4e0ba835793..1e4ee571f1af5 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6b45d1cbe880..dc6cfe9daccdc 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import dataclasses diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 397517b8665bc..1d000fe00c598 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 5d38ff91490ee..aade29b99de7e 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 509593e7328de..0c25aae52d465 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index b630d0e85d31a..251cc46e9e989 100644 --- a/tests/compile/test_pass_manager.py +++ 
b/tests/compile/test_pass_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import pytest diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index 2cd7ebaacec00..c689befdf2da6 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 9eae48d60f368..df36b86abdbe4 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 0934c61135792..5e39f6821d16c 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/conftest.py b/tests/conftest.py index 6336c6c2ce011..5ec3926bd31f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os import tempfile diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py index b7a9863f4aaf5..6afe98d78ce81 100644 --- a/tests/core/block/conftest.py +++ b/tests/core/block/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/e2e/conftest.py 
b/tests/core/block/e2e/conftest.py index 83259b690337a..e2c6c66b259c8 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Callable, Optional diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 9e8e315d87b18..f296c81e17685 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import cycle diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 039b5e739892a..3429a858dda59 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index 68d9618ae245b..9eed264fd7d43 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 250c9a7497d23..ba085001136be 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index 
20260873003df..65400899b811c 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index a1414edd95622..795eef6743fd1 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 4b9454c84ff65..a31d1c46b37f0 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 50233624f7d17..46e224c6f53b2 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 1a20e2c135c2e..375b248ebedaa 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 161b32f01b111..d4dacc4f1296d 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index a4a90144482bb..1b958e34df870 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index a5ba16898d891..db78a9d556422 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import deque diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index c6049b26a2bcd..20cc083ec8db4 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest # noqa diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 64b3e148ee728..8281298d6634c 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import msgspec diff --git a/tests/core/utils.py b/tests/core/utils.py index 84b0426b470bc..b746c17864641 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import defaultdict diff --git a/tests/detokenizer/conftest.py 
b/tests/detokenizer/conftest.py index 59394b0351bda..f2c125355c83c 100644 --- a/tests/detokenizer/conftest.py +++ b/tests/detokenizer/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py index 14f9babb8d8a6..ae06a985c7ecd 100644 --- a/tests/detokenizer/test_disable_detokenization.py +++ b/tests/detokenizer/test_disable_detokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py index e9ad8d1612102..bd221977224f9 100644 --- a/tests/detokenizer/test_stop_checker.py +++ b/tests/detokenizer/test_stop_checker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 4b1e4f5cf45e8..9716f7d72a585 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the different finish_reason="stop" situations during generation: 1. One of the provided stop strings 2. 
One of the provided stop tokens diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index 0607dd01a3395..efe938a20c4f4 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index ee8f2097933d1..95f085788b856 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional, Union diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index 72e7ebdb7b594..e2de462612b47 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # can only run on machines with p2p access across GPUs # can only run with torchrun: diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 8f4c3537e1586..e2cb579e22dc4 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the communication operators. Run `pytest tests/distributed/test_comm_ops.py`. 
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index a7ba45c9e546e..fae49c41d5f83 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index 4b0c65d1d3a47..b93696e4be0e1 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ..entrypoints.openai.test_oot_registration import ( run_and_test_dummy_opt_api_server) diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index 8de1aa20eabd0..ec1e5a2d62f11 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import threading import time diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index db82816178030..f641bf1604145 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Literal, NamedTuple, Optional diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py index c86d2d8a0061a..ef17a51fff0e1 100644 --- a/tests/distributed/test_multi_node_assignment.py +++ b/tests/distributed/test_multi_node_assignment.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project """Make sure ray assigns GPU workers to the correct node. Run: diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e6410ab068d23..7d569fd83821d 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 7bf93f270148b..69ceedd345a89 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3ca6e7b33a5ee..a027a9e37dd67 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations from typing import TYPE_CHECKING diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 2c323edfa2af2..5b32b90f3cfec 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import os diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 9b1bbd6e545c1..94ad8f4f1213a 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index c9eba2b43788e..91a594eac5c42 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index f9eacc11d75f8..e1357b4a34e99 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import random diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index bb38e908b7345..9f2c3eaec3597 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # unit test for `examples/offline_inference/torchrun_example.py` import os diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 4432950f274e0..0287ad94e3886 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import socket diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 0f46fba3ac49f..8b99d9d6e21fb 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ 
b/tests/encoder_decoder/test_e2e_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """E2E tests to verify the correctness of the encoder-decoder framework Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py index 1a20e2c135c2e..375b248ebedaa 100644 --- a/tests/engine/conftest.py +++ b/tests/engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 05d9cfc7ab747..ab78aa7da21bd 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from argparse import ArgumentError, ArgumentTypeError diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index 049fa2c8b12bd..ac5a1f957dfe4 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 91c9ba4a74e62..15c7a97b50e1f 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py index b67dd86bfdf0b..458f4deb743ac 100644 --- a/tests/engine/test_multi_step_output_processor.py +++ b/tests/engine/test_multi_step_output_processor.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import MagicMock diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 9b2f45def6c54..b5381b61a020a 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from concurrent.futures import ThreadPoolExecutor diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py index 0cf4f69d56a87..fc6a78a5112a1 100644 --- a/tests/engine/test_options.py +++ b/tests/engine/test_options.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext import pytest diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index b29d6362f571b..9c62761d78afb 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 3b596ea3e6a0d..a7c533ec24198 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 95657455bd7bb..a2d35486a5e81 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file test accuracy of the 
vLLM server via LMEval. It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 742a666834457..97cf3b5ce8fcb 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref import pytest diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 6470249dddbcf..3a13f8c979f23 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index d10257761c861..f0fa54aa3131c 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 9a895c922cc39..4676dc992a879 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 099af0f36088b..b7d53e31fd71b 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git 
a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py index c2b4a935886ba..533da9e6d6eac 100644 --- a/tests/entrypoints/llm/test_gpu_utilization.py +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index dd5d17885eb91..d41b0a436c62d 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import weakref diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index f065f6564cd2f..61b6b4fbf8e35 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys from contextlib import nullcontext diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 665c6ea1e6994..1b7be15d5d691 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 23fd72f4ebbb9..a606eeab5887e 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" import importlib import sys diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index d3948e2ed575e..41b70f80e3b83 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file test accuracy of the vLLM server via LMEval. It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py index 44d7ac193760f..437c485113520 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 642c204b9ff00..58195f98bd351 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Evaluate Transcription API correctness by computing Word Error Rate (WER) on a given ASR dataset. 
When provided, it will also compare the WER against diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 1f7ba0da4f246..ab3c809054384 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import contextlib diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 7f959f3120191..d67c05ab3e8de 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index a4ac800707734..a55941976cd82 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from http import HTTPStatus diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 2509ef0d280a2..dab947b21b284 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import json diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 86ee17c6f4491..de63f4ed218b6 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
from typing import NamedTuple diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py index 9dab524ea4801..e9d1a855294cb 100644 --- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py +++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index f18fbb0a9c711..daa4a78c935a7 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index e00f001ef730d..03730b67283c4 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 0419395f1816b..3c8ed955a65a2 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 97124c85e0d33..6d5f925152c3c 
100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import requests diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 8d1abe28a027a..504fd72aa4ae2 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 9d12f27a2b879..7e54143f6e1c3 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import json import shutil diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index dad76b54c5e99..dbea2dc0b0782 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index b7ee3e33c2d25..00d3ffb61ee9f 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 81ca65b6541a8..80640a2e1a8bc 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 341defae0b315..08b797dc57ad2 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. """ diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 52b4df9ceecd7..9c2aef23e8772 100644 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index cd07ca46ca651..bcdeaaacedea0 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index c96151349eb3f..d4afdf7751c8f 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ 
b/tests/entrypoints/openai/test_lora_resolvers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import suppress from dataclasses import dataclass, field diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index b21c0173c7b86..2d7b845736b87 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 3d4f1cde27895..1980daa80db9e 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index a1b7a205a4575..f0ce50debe494 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ...utils import VLLM_PATH, RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index cae2a3b59553d..4ded37595384e 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Final import pytest diff --git a/tests/entrypoints/openai/test_pooling.py 
b/tests/entrypoints/openai/test_pooling.py index 72ab12c564602..cf16ace6537ac 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e384915899d3d..ff0730c77032c 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import openai diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index ba11cd3a29a8e..19eba320c2795 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import requests diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 6474858642d78..099062e55c729 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Separate these tests out from test_completion and test_chat, because they # require launching a second server with a different flag. 
Running both servers diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index 106d6b2c14f83..7b4966848b9de 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import os diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 99639ce51aa74..e23f41e983b0d 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import subprocess diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index b373f29127524..af51a0a3eeebf 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import pytest diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 5e11af8cf8929..94740fefc870e 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from contextlib import suppress diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index e8f3c2f8b39ee..28af6489a4d0a 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from http import HTTPStatus from unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 0f12ac9b260be..29a94c852bba6 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 3ca8a9a410ffd..0dd6af17ef227 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index f1ab7223048db..e143150356d92 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc import json import tempfile diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 7d823542e3744..57dd25fe1b164 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 5c48df3cebbc2..1cb0a39df5139 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import io diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py index 137ed9db85891..b33a26af65b33 100644 --- a/tests/entrypoints/openai/test_truncation.py +++ b/tests/entrypoints/openai/test_truncation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import openai diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 53f057a294c0a..990ea3579291d 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 1ab50b41c7ecb..4513d8b3420f4 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 26c68e06c199f..fe982e286ae47 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index f5f327ea068c6..8c86b4889e15b 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ 
b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 71f41ea7d93b4..d83137472598e 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index ab8f4bd678fdf..e1b41f45f5548 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Union diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index 0dd1fdd996948..e4af60a782651 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import socket diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 9f1f2321d9e64..49294664275a0 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from 
typing import Optional diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py index 23ce7a679f3ea..33ad2cfd3a33a 100644 --- a/tests/entrypoints/test_ssl_cert_refresher.py +++ b/tests/entrypoints/test_ssl_cert_refresher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import tempfile from pathlib import Path diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py index 184bee2a7153a..1b95bf59f67c6 100644 --- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import SamplingParams from vllm.config import LoadFormat diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/fastsafetensors_loader/test_weight_utils.py index 8772035af502f..78d23acfec7c5 100644 --- a/tests/fastsafetensors_loader/test_weight_utils.py +++ b/tests/fastsafetensors_loader/test_weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import tempfile diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 97ceffab4eb88..9d65159bf64fe 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py index 4f04ec9475329..88a2fb62b2540 100644 --- a/tests/kernels/attention/conftest.py +++ b/tests/kernels/attention/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index d9f956fbc7c00..2d381a99be60c 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 58da01f0ebbf3..435fe62256140 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py index 82d038257575c..9aee818c99569 100644 --- a/tests/kernels/attention/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 2f2212dd2b0e0..e508505c2b05d 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index d6570e6334b16..1e7e7e0a7f84b 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py index c8ee46bc65d4d..c6ce7b0cce40d 100644 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ b/tests/kernels/attention/test_encoder_decoder_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests: diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 88516b75cde2b..bd3190d09b0fa 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 5ad1137aa6af7..3ad6e1d32911b 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 0d51a8e7fee19..21b08e45fd6fd 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -1,5 +1,6 @@ # Adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla.py # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py index fbad52987dd2b..de45ee1ed5cca 100644 --- 
a/tests/kernels/attention/test_lightning_attn.py +++ b/tests/kernels/attention/test_lightning_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index 7038fbea5c22e..9d1a301ebe304 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 5a18b7916f0f6..53c37554b15a3 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test: diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 8cebe32c4c5bb..5a7480a6beaea 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 9333777d38ea0..b09e1bbc42794 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/kernels/attention/test_rocm_attention_selector.py 
b/tests/kernels/attention/test_rocm_attention_selector.py index 6ffe27abf709e..ed58880cc9e6c 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index fd3c9fa4196a7..358b374ea75bc 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index be3d1879de24b..0cb7f5963c79b 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 79f838a954e70..29c5e70a8ba85 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 7a591f5367834..19703b8a2f978 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project from typing import Optional, Union diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index fa4bbe458645f..3eac062738f80 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py index c9a9679c5d80f..40ced08b933a7 100644 --- a/tests/kernels/core/test_opcheck.py +++ b/tests/kernels/core/test_opcheck.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py index 35d62079fb65d..e18f6230dbcea 100644 --- a/tests/kernels/core/test_permute_cols.py +++ b/tests/kernels/core/test_permute_cols.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 8cb56314cf94a..ab6f1ccf881fd 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate, product from typing import Callable, Optional diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 8383f943b9fa4..db0fdcbf5ef22 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for 
miscellaneous utilities """ diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py index f641ae7b67c2d..c71215e4c646b 100644 --- a/tests/kernels/core/test_uva.py +++ b/tests/kernels/core/test_uva.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index 93064e23dd7d1..addb8bfcda137 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index abcf3888fea26..f5c6a18614ff7 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import unittest diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 84d4c347e0d81..8dece26ddb29c 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index f5e751bea4149..abed1252a3ce6 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 
7d369edfc86a4..b0e0feab4689b 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 7db4fe0f46e3f..558288ba44d72 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Optional diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 299279390fe0c..7238813a299d6 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE layers. Run `pytest tests/kernels/test_moe.py`. diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 10e6ac64df877..7cc83b512c8b9 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE permute/unpermute kernel Run `pytest tests/kernels/test_moe_permute_unpermute.py`. 
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index ae63b379f39d1..be33200cc2069 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 8c4a2c3fa440f..95c10037b233c 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE layers. Run `pytest tests/kernels/test_pplx_moe.py`. diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py index 922fd66dbef49..1c51c530c193c 100644 --- a/tests/kernels/moe/test_rocm_aiter_topk.py +++ b/tests/kernels/moe/test_rocm_aiter_topk.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # This is a test for the AITER ops. # It tests if the AITER ops are # 1. 
correctly registered as custom ops diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 3b5838a99fa15..dfd0f35c8da3d 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py import itertools diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 892309a017e43..0840cc7b54fcb 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py index 58eaeee1c0b88..1095975ab2b41 100644 --- a/tests/kernels/quantization/nvfp4_utils.py +++ b/tests/kernels/quantization/nvfp4_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from vllm.scalar_type import scalar_types diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py index 896e0265738b7..3de9cb3644684 100644 --- a/tests/kernels/quantization/test_allspark_gemm.py +++ b/tests/kernels/quantization/test_allspark_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py index 7d36172815b78..427db3e602921 100644 --- a/tests/kernels/quantization/test_aqlm.py +++ b/tests/kernels/quantization/test_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py index 248b294e546b3..bc0868123d82a 100644 --- a/tests/kernels/quantization/test_awq.py +++ b/tests/kernels/quantization/test_awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 3fc3feaf4972c..96797e85bd125 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the AWQ Triton kernel. Run `pytest tests/kernels/test_awq_triton.py`. diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index ae05d61173f33..8c5ee98743d72 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/pull/2575 import itertools diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index a4e9f83f0eaf1..fa2c9f890d6fb 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py import itertools diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index 
d67d2dbb89981..878f66647e19e 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for sparse cutlass kernels Run `pytest tests/kernels/test_semi_structured.py`. diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 633addd421f43..51bb29df054e5 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for cutlass kernels Run `pytest tests/kernels/test_cutlass.py`. diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 876cf03fd644c..0a3edd4ddc16a 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py index 73697a6d1242d..07651fef39bf4 100644 --- a/tests/kernels/quantization/test_ggml.py +++ b/tests/kernels/quantization/test_ggml.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gguf import pytest diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index ad755fe7f7a0b..436d5cb640219 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import 
Path diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py index fea013d9e5795..7fb57a1576bd8 100644 --- a/tests/kernels/quantization/test_gptq.py +++ b/tests/kernels/quantization/test_gptq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py index 4c7543527c323..dc5fecbf4ccc8 100644 --- a/tests/kernels/quantization/test_int8_kernel.py +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py import itertools diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 25dcb587e4878..63ccf4a917369 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 5aeaaa654ed60..998171baaf2de 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the machete kernel. Run `pytest tests/kernels/test_machete_mm.py`. 
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 52507b375c271..92914bd5cbba7 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the marlin kernel. Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py index b8aa1672100e2..3a8f4c17598c2 100644 --- a/tests/kernels/quantization/test_nvfp4_quant.py +++ b/tests/kernels/quantization/test_nvfp4_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 1f49900b2d90b..0b45c22981752 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index c7eee899896ac..533a4fe596779 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 30e6eeb8d5660..8a2cc3baced23 100644 --- 
a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_scaled_mm kernel Run `pytest tests/kernels/test_triton_scaled_mm.py`. diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 87e4bd4b096b3..c56024b757e14 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index faa8d49ce41be..803453a20d81d 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py index cf2bdc908e420..1c31cfb25e5ac 100644 --- a/tests/kernels/test_triton_flash_attention.py +++ b/tests/kernels/test_triton_flash_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_flash_attention kernel Run `pytest tests/kernels/test_triton_flash_attention.py`. 
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 22b3d7c2be7a5..d1db6a8eb1ba4 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Kernel test utils""" import itertools diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py index dc948a48bf326..9f2229cc41dff 100644 --- a/tests/kv_transfer/test_disagg.py +++ b/tests/kv_transfer/test_disagg.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import subprocess diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index c5b34660d1658..352ab63552de7 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import random diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py index 8a6490b5c8876..7a04174870daf 100644 --- a/tests/kv_transfer/test_module.py +++ b/tests/kv_transfer/test_module.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 3dd923d24050c..32116608a2177 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 399311ce65bb8..0737bb886e43e 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile from collections import OrderedDict diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 17347300b40c8..cc8160b2860d9 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 007be7aa582ea..774ebb9db2106 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index cd9526c8b1012..5481b413b8f5f 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import vllm from vllm.lora.request import LoRARequest diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 0a8b38fa748a6..92db023babc28 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from copy import deepcopy diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 54daea5b9dbf0..23819f03dc51f 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys from typing import Union diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py index 094541aef02bb..01bc102bd112b 100644 --- 
a/tests/lora/test_lora_allowed_token_ids.py +++ b/tests/lora/test_lora_allowed_token_ids.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 02f2339bef01d..ebc0f26378d27 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index fd80f61a59773..e9a52e1b63573 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Script to test add_lora, remove_lora, pin_lora, list_loras functions. """ diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 90498c47fb104..b46d81f1651a6 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 52b0834cacb85..8f8a27006cf67 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 24242b8a17594..99fe951bbf070 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import 
pytest diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 4e77c5559e164..0ea07793311cb 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index 9935472ad18f4..f16589e06b2dc 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import math diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 7375cabbc36d9..a21de070517b1 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index add313c945446..14fa79ae5b446 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from threading import Lock import pytest diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 43e2975cd87c0..caa31fdb0e73e 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 20a1ae67db2dc..604bb307b889d 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py index 8ebc2ae98fc43..6c93e577611f8 100644 --- a/tests/lora/test_resolver.py +++ b/tests/lora/test_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 8845eb33d207e..6cfdaf50d33c4 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py index 63907f2c1d02c..5065a2fb71649 100644 --- a/tests/lora/test_transfomers_model.py +++ b/tests/lora/test_transfomers_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 0d4e0bf681f2c..b343bef0a920b 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import OrderedDict from typing import NamedTuple, Optional diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 1a5d527164d0b..6f13e663a78bb 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import random diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 
59a0e7420fc25..cc1b0d81955bc 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional, Union diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e71c87ff3fc82..7bb5d8980d614 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py index 39ab01c9b8741..e89e60c5a02ec 100644 --- a/tests/mistral_tool_use/conftest.py +++ b/tests/mistral_tool_use/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py index bbb3a07895f6c..9bf6863f3f2b7 100644 --- a/tests/mistral_tool_use/test_mistral_tool_calls.py +++ b/tests/mistral_tool_use/test_mistral_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py index 1d809a05e89d1..7a026cd9bb619 100644 --- a/tests/mistral_tool_use/utils.py +++ b/tests/mistral_tool_use/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index b588a1a96638b..c6d89d849e9f9 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index e957db5b3f16a..a94215ee397bf 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 6cd966f84802b..ac31064d92120 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import pickle diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py index 8301c645b79f8..532ebba038d38 100644 --- a/tests/model_executor/test_logits_processor.py +++ b/tests/model_executor/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import patch diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 7fda1f0e80d07..94a14bd24bcb6 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index bdaba22c3c7a8..df625b8d60049 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import tempfile diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index 8ab0167dc771d..7d8acab5e8343 100644 --- a/tests/models/language/generation/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 05dd18fbdf8b3..ed9e547225149 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py index f381c34f44b8c..2a39f78a708ee 100644 --- a/tests/models/language/generation/test_granite.py +++ b/tests/models/language/generation/test_granite.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from ...utils import check_logprobs_close diff --git a/tests/models/language/generation/test_granitemoehybrid.py b/tests/models/language/generation/test_granitemoehybrid.py index da3f5e1100bfd..952449f284159 100644 --- a/tests/models/language/generation/test_granitemoehybrid.py +++ b/tests/models/language/generation/test_granitemoehybrid.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 
604cb854b32ff..3eaadcb45fe12 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c1b612ae213b9..bdd857ff50620 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import json diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py index 603ca1cb12a5b..6c9cc2821c30f 100644 --- a/tests/models/language/generation/test_phimoe.py +++ b/tests/models/language/generation/test_phimoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 0c8ac2ab1b9eb..07bc9f447e336 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index f45168bc0f1d6..2705be25e7cc7 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence import mteb diff --git 
a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index fc0e8207954fa..1af3c05d3d907 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from .embed_utils import EmbedModelInfo, correctness_test_embed_models diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 57b3cb58d88ba..4a6d781ce6f09 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch from transformers import AutoModelForSequenceClassification diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 8f82c8091af37..9516a01421cbb 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from vllm.config import PoolerConfig diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index f450edd821623..c2f70bb647a4e 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import importlib.util diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 725e3d168408b..2178a815b71c8 100644 --- a/tests/models/language/pooling/test_gte.py +++ 
b/tests/models/language/pooling/test_gte.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import pytest diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 0403a20a445af..2adf34b292872 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial import pytest diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 92cd7cc569d39..59dbd74fb6fb6 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 68603e62843eb..250b3a52835af 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import pytest diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index 6b10aeffc4b72..c75ff14456169 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py 
b/tests/models/language/pooling/test_snowflake_arctic_embed.py index c6c2d1e7a679d..d6b5dbd08372e 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 1b8ac395ed179..33aff1c873fc4 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index e4e48f9951cf2..a5bbcfc22e9cd 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index b8225f5f12437..b048cec5e5e0f 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 96c444441e3d2..14552010d3762 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py index 972db40e8bd61..949c0a80d31bc 100644 --- a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 99aa3c2d3bd99..2bb01e494d436 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, overload diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e51dbee479c55..e4cd476a96b1d 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py 
+++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Sequence diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 506b71472f4a8..1def825ab0874 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from dataclasses import asdict from typing import TYPE_CHECKING, Any, Optional diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 6be401b775ec2..a2793b8c8ddf7 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional, TypedDict, Union diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py index 2c8a06688ca02..e7e7bd3154a11 100644 --- a/tests/models/multimodal/generation/test_ultravox.py +++ b/tests/models/multimodal/generation/test_ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Any diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index d0b85842a3d8f..363d55153aac6 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
from typing import Optional import pytest diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 32117c8d8dca0..7d20dd66089bb 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helpers for building inputs that can be leveraged for different test types. """ from collections.abc import Iterable diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index a5077a090b523..336e2dd2b1201 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for determining which subset of model tests belong to a specific modality, getting all combinations (similar to pytest's parametrization), handling multimodal placeholder substitution, and so on. 
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index ccd2799abd90c..8c83d8f8a8a22 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Core test implementation to be shared across modalities.""" from typing import Any, Callable, Optional diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index cc10455611386..aa5835243e042 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" from io import BytesIO from typing import Callable diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index dc1ea5208240d..1b087191f6363 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Common utility functions relating to different models that are useful for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. 
diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py index 9e8a1262e8c1c..562f89df13470 100644 --- a/tests/models/multimodal/generation/vlm_utils/runners.py +++ b/tests/models/multimodal/generation/vlm_utils/runners.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Entrypoints for wrapping the core run_test implementation for specific test types / modalities. """ diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 1c2bb4d6222b4..0ec7909e744d7 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Types for writing multimodal model tests.""" from collections.abc import Iterable from enum import Enum diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index ea1caec0ecf34..3734d87b7962e 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py index 76f9fbe025505..3e2be34a50ad5 100644 --- a/tests/models/multimodal/pooling/test_intern_vit.py +++ b/tests/models/multimodal/pooling/test_intern_vit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn as nn diff --git a/tests/models/multimodal/pooling/test_llava_next.py 
b/tests/models/multimodal/pooling/test_llava_next.py index 77508738cc870..b6d90d2b0abed 100644 --- a/tests/models/multimodal/pooling/test_llava_next.py +++ b/tests/models/multimodal/pooling/test_llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch.nn.functional as F diff --git a/tests/models/multimodal/pooling/test_phi3v.py b/tests/models/multimodal/pooling/test_phi3v.py index cd58a5cb4531c..b42ac6fb21edd 100644 --- a/tests/models/multimodal/pooling/test_phi3v.py +++ b/tests/models/multimodal/pooling/test_phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch.nn.functional as F diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2377fef820ed1..be574435e0995 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial from typing import Optional, Union diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 37142b6dd36f1..76e4acc67d4d5 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for H2OVL's multimodal preprocessing kwargs.""" from collections.abc import Mapping from typing import Optional diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index c35ce2f6ab291..d3a55993e5588 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ 
b/tests/models/multimodal/processing/test_idefics3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for Idefics3's multimodal preprocessing kwargs.""" import pytest from transformers import Idefics3Config diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 7ec81197a3db6..c3e2841a8f060 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for InternVL's multimodal preprocessing kwargs.""" from collections.abc import Mapping from typing import Optional diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 614f17dbbeda7..9ef7af556291e 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for Llama4's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index b82bfe483dbbc..ca34d1d758a46 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from functools import partial diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index dcc8dc8dab5a0..e6344c4e7e6fd 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ 
b/tests/models/multimodal/processing/test_llava_onevision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from functools import partial diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 9bd2b9887294f..9387212e3f101 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from PIL import Image diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index d4794396f6d20..a6b20a1e3678e 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for mllama's multimodal preprocessing and profiling.""" import pytest from transformers import MllamaConfig diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index b53351544c458..1f3646f794868 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index c6e272650e08b..f16d261c2c6a4 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for phi4mm's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 02abe1ca8b024..9d1cd183387bc 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 224d1bcedb966..af8f983388c6c 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for smolvlm's multimodal preprocessing kwargs.""" import pytest from transformers import SmolVLMConfig diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py index 1272a62974cc8..de6851e2fc282 100644 --- a/tests/models/quantization/test_aqlm.py +++ b/tests/models/quantization/test_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from tests.quantization.utils import is_quant_method_supported diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index 597c8e48fb64d..bd696198931ff 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py index f0781394d81d1..754ac9a29a132 
100644 --- a/tests/models/quantization/test_bitblas.py +++ b/tests/models/quantization/test_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a bitblas model. Note: GPTQ and bitblas do not have bitwise correctness. diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index e01ee20263935..10914abf9ad3d 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests fp8 models against ground truth generation diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 5f17d12284a04..eafdfd1b09aff 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests gguf models against unquantized models generations Note: To pass the test, quantization higher than Q4 should be used diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py index c8e96455fd0c5..c3aed77525de9 100644 --- a/tests/models/quantization/test_gptq_bitblas.py +++ b/tests/models/quantization/test_gptq_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a bitblas model. Note: GPTQ and bitblas do not have bitwise correctness. 
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py index 397bdb98123f1..db70a3bd2c046 100644 --- a/tests/models/quantization/test_gptq_marlin.py +++ b/tests/models/quantization/test_gptq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compares the outputs of gptq vs gptq_marlin. Note: GPTQ and Marlin do not have bitwise correctness. diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py index 6fb24b1f432e6..9b86ae95ba5c7 100644 --- a/tests/models/quantization/test_gptq_marlin_24.py +++ b/tests/models/quantization/test_gptq_marlin_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a Marlin_24 model. Note: GPTQ and Marlin_24 do not have bitwise correctness. diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 1d9aa4fa8adea..6ad526cc893f3 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Model Optimizer fp8 models against ground truth generation diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py index 9a060829525e1..7b8a334bbc369 100644 --- a/tests/models/quantization/test_mxfp4.py +++ b/tests/models/quantization/test_mxfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Quark mxfp4 models against ground truth generation """ diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index 510858c2d7ef2..b95dad9a4effe 
100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Model Optimizer nvfp4 models against ground truth generation Note: these tests will only pass on B200 diff --git a/tests/models/registry.py b/tests/models/registry.py index 182a9668ebef1..ed49676a9f5d6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping, Set from dataclasses import dataclass, field diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index d403cb392fe06..af023d9034383 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index b62720caa9cb5..ef0ad613d5252 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 3282284b6b27c..b7527ca2706b6 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 1a51b4aeab04d..b7b99ce41cbb0 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,4 +1,5 
@@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the functionality of the Transformers backend.""" from typing import Any, Optional, Union diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py index a16384efe1956..b52327a1844f6 100644 --- a/tests/models/test_utils.py +++ b/tests/models/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index d64c0e6d4e430..310d3a3719b65 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/models/utils.py b/tests/models/utils.py index ffc904bd10f46..943b4f5704468 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from collections.abc import Sequence diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py index 1a20e2c135c2e..375b248ebedaa 100644 --- a/tests/mq_llm_engine/conftest.py +++ b/tests/mq_llm_engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index 808346b5e58d5..5ff08cbb32487 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that aborting is handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_error_handling.py 
b/tests/mq_llm_engine/test_error_handling.py index e617bd057f1f4..49b02279d61bb 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that various errors are handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 2069ff987f2fe..e9fd5b814f285 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that the MQLLMEngine is able to handle 10k concurrent requests.""" import asyncio diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index 64559609abb2d..7976d5031aea1 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import multiprocessing diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index ce716e6474cb4..56e339d485c56 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Test the AsyncLLMEngine with multi-step-decoding from typing import Optional diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index a823e484beab6..9f1b3bbe8e226 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Test the LLMEngine with 
multi-step-decoding diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py index 17b36b36888d5..b5048c8cc3ad8 100644 --- a/tests/multimodal/test_hasher.py +++ b/tests/multimodal/test_hasher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import numpy as np diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py index 56b5475c9ca04..cfd44351a6d1f 100644 --- a/tests/multimodal/test_image.py +++ b/tests/multimodal/test_image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import numpy as np diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py index f5d3e282f953d..ffb3a6fe86b46 100644 --- a/tests/multimodal/test_inputs.py +++ b/tests/multimodal/test_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 59f7bf8fab2fe..8b52911c6ccf3 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext from types import MethodType diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index f1e45da30eda4..e4debb47cec1e 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import mimetypes diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index e67624ecefcb6..9a700808d9d8a 100644 --- a/tests/multimodal/test_video.py +++ 
b/tests/multimodal/test_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import numpy.typing as npt import pytest diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 40fcfeeeac7d0..23346509a06fd 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np from PIL import Image diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py index ec2b1238e4042..2d6e5f523cb85 100644 --- a/tests/neuron/1_core/test_activation.py +++ b/tests/neuron/1_core/test_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 033a36b4156b0..efec56360c142 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import neuronxcc.nki.language as nl import pytest diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py index 3d869cd2fa17f..670889ad6b58d 100644 --- a/tests/neuron/1_core/test_cache.py +++ b/tests/neuron/1_core/test_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py index e96df8db6ccdf..c6fce1d1a0630 100644 --- a/tests/neuron/1_core/test_layernorm.py +++ b/tests/neuron/1_core/test_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py index 6d1514088f90c..ce9eadf5a883e 100644 --- a/tests/neuron/1_core/test_logits_processor.py +++ b/tests/neuron/1_core/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import patch diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py index 92417fb64f7f8..5f3268810f9fe 100644 --- a/tests/neuron/1_core/test_neuron_model_runner.py +++ b/tests/neuron/1_core/test_neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from unittest.mock import MagicMock diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py index 68f0cb8054b4f..0863002695928 100644 --- a/tests/neuron/1_core/test_neuron_quant.py +++ b/tests/neuron/1_core/test_neuron_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.layers.quantization.neuron_quant import ( NeuronQuantConfig) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 8f7e711b525e3..8b9a5f6e4a6af 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py index da57631fcfc59..a7ac79729986d 100644 --- a/tests/neuron/1_core/test_rotary_embedding.py +++ 
b/tests/neuron/1_core/test_rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for miscellaneous utilities """ diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py index 3cad160b2cb78..85a48dae58aaf 100644 --- a/tests/neuron/2_core/test_comm_ops.py +++ b/tests/neuron/2_core/test_comm_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from typing import Callable from unittest.mock import patch diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py index d71c88689a994..cac642af03101 100644 --- a/tests/neuron/2_core/test_eagle.py +++ b/tests/neuron/2_core/test_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index 3e651502d1e2a..d02fff943e90a 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py index 6fa8f9128def7..6b97f47d4db34 100644 --- a/tests/neuron/2_core/test_multi_lora.py +++ b/tests/neuron/2_core/test_multi_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from huggingface_hub import snapshot_download diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py index cb0f0c3c5fa61..3e2c2577da66c 100644 --- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py +++ 
b/tests/plugins/lora_resolvers/test_filesystem_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import shutil diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py index e3fb6efb27576..6307bb63897ac 100644 --- a/tests/plugins/vllm_add_dummy_model/setup.py +++ b/tests/plugins/vllm_add_dummy_model/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 0c431cb39737b..b2085b01c45c1 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import ModelRegistry diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index bc4a41cdf00de..aff3498567d2e 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional, Union diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index c23ab64308f20..da97cf7e2b40b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index bbd11ed4aac9d..8c34407e3e071 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py index 10df0b5e05035..e40f62f7749be 100644 --- a/tests/plugins/vllm_add_dummy_platform/setup.py +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py index 0d1b062ac2eb5..1b28342eb1791 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py index 33425bbc11ed9..f30a36f35f5d5 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py +++ 
b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.flash_attn import FlashAttentionBackend diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index 5cefafc7e06c7..67cd5ed3b73df 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.platforms.cuda import CudaPlatform diff --git a/tests/plugins_tests/conftest.py b/tests/plugins_tests/conftest.py index 8561f2ddfa266..c8c1b81ca2183 100644 --- a/tests/plugins_tests/conftest.py +++ b/tests/plugins_tests/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 207de53abd8d1..685a8cd2c8b82 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 4c95a52a967bd..8c21216108685 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git 
a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 4cc399175df41..f00a8f6998cbd 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 607b6c43e02e2..a65fc934b16ab 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py index a31d8e873d798..2b603fe8f0228 100644 --- a/tests/prompt_adapter/test_bloom.py +++ b/tests/prompt_adapter/test_bloom.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py index e249a6e64427a..4f273afb4e368 100644 --- a/tests/prompt_adapter/test_multi_adapter_inference.py +++ b/tests/prompt_adapter/test_multi_adapter_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import EngineArgs, LLMEngine, SamplingParams from vllm.prompt_adapter.request import PromptAdapterRequest diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py index fb4c3e1497652..ba2e15b81bc1e 100644 --- 
a/tests/prompt_adapter/test_pa_lora.py +++ b/tests/prompt_adapter/test_pa_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from huggingface_hub import snapshot_download diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py index 81ceecdb45d65..1c41d904b8168 100644 --- a/tests/quantization/test_auto_round.py +++ b/tests/quantization/test_auto_round.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and inference for quantized HF models supported on the AutoRound. diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index e8ddfd7fc7795..325a902b31112 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project '''Tests whether bitsandbytes computation is enabled correctly. Run `pytest tests/quantization/test_bitsandbytes.py`. diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index c968a68f1a8e8..807b24d4e3aaa 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and weight loading for llmcompressor-quantized models. Run `pytest tests/quantization/test_compressed_tensors.py`. 
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index e30166842ea8a..8b0ffc0fe42f1 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether Marlin models can be loaded from the autogptq config. Run `pytest tests/quantization/test_configs.py --forked`. diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index a05eb494c11a7..08d9573ecf0b8 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Expanded quantized model tests for CPU offloading # Base tests: tests/basic_correctness/test_cpu_offload.py diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index b6db6d5f2fdc5..50179b9a904d2 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests experts_int8 quantization startup and generation, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index e74e14a0dcb64..e5ab7b3dd3cfb 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether FP8 computation is enabled correctly. Run `pytest tests/quantization/test_fp8.py --forked`. 
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 22055c49ae296..23b999e7c679b 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether gptq models with dynamic quantized can be loaded. Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index 0e3913676f5f7..34b1b6c2e5b6d 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and inference for quantized HF models supported on the CPU/GPU backend using IPEX (including AWQ/GPTQ). diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 1c6bd18521c31..11f78a23bb4c0 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether gptq models with quantized lm_head can be loaded. Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 9bbb5e327968f..5f78bc30504c0 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether PTPC w8a8 FP8 computation is enabled correctly. Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. 
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index ae09ac58e6759..3571f773fb023 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and weight loading for quark-quantized models. Run `pytest tests/quantization/test_quark.py`. diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 0ea71aaf828bc..42081a8c68cdc 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests register custom quantization config. See https://github.com/vllm-project/vllm/issues/11926 for more details. diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 6571fc9e471bd..c966dc9b81525 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.metadata import importlib.util diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 7a339c162cc48..20a425b721145 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.layers.quantization import get_quantization_config from vllm.platforms import current_platform diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py index 1b669c8fd2fb9..987f3c48de0c0 100644 --- a/tests/reasoning/test_deepseekr1_reasoning_parser.py 
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py index 48fb8c2f8d1b9..38cab73a45f22 100644 --- a/tests/reasoning/test_granite_reasoning_parser.py +++ b/tests/reasoning/test_granite_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py index 95b7460d359e4..2d5557d5cdc13 100644 --- a/tests/reasoning/test_qwen3_reasoning_parser.py +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index 0f894ed800c6c..ddcf89796fb5a 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py index 8b96184f579e4..e27d9958f2917 100644 --- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py +++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import SamplingParams from vllm.config import LoadConfig, LoadFormat diff --git 
a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py index 06e506c35761e..ee448c2ccb213 100644 --- a/tests/runai_model_streamer_test/test_weight_utils.py +++ b/tests/runai_model_streamer_test/test_weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import tempfile diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 5de1137eaf682..bdf48c7687b25 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of HF and vLLM when using beam search. Run `pytest tests/samplers/test_beam_search.py`. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 2a124aa0c5960..7eb9c0b5fb8c8 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Make sure ignore_eos works. Run `pytest tests/samplers/test_ignore_eos.py`. 
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 74f1eb4a95477..901c875912643 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 5cc646e76ec84..86c8a03eee10f 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index f9688b4b9b272..42b529ae169de 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Make sure bad_words works. Run `pytest tests/samplers/test_no_bad_words.py`. 
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index ebe9b302148c0..86fc14dc85f80 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 6ef61f2ff4069..3b93c64113dac 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for rejection sampling.""" import pytest diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 7b19d5750906d..520b88d03ac8e 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import random diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index efa2642dba971..b339b4b2ddf3d 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Verify that seeded random sampling is deterministic. Run `pytest tests/samplers/test_seeded_generate.py`. 
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 279e5ed100d97..418471b8e5238 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for rejection sampling.""" import pytest diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py index 1a20e2c135c2e..375b248ebedaa 100644 --- a/tests/spec_decode/conftest.py +++ b/tests/spec_decode/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 921081f3c3f2e..f3fe9db3f79ea 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from itertools import cycle diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 4fd52cf7e2cb3..6c453879a6a6a 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index eee535a146f45..98939461422e1 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing 
methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index 9dfc1b2fd91ef..7608618502966 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with other features, e.g. cuda graphs. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index b112974754208..a18be80c50dd9 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with tensor parallelism. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index a1b7c8b40c39d..039eec8fd2cc9 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with tensor parallelism. 
""" diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index cb2dae541411a..1629c69f8ee9d 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import cycle diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index 5c60100e6797e..064a6e10ae6ef 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 7bf29349d6724..9f778ca8d179b 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py index 371e6834b6398..d4d4d519b7a14 100644 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ b/tests/spec_decode/e2e/test_mtp_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index e187b6bc14347..6d385184d264a 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """The tests in this file verify end-to-end speculative decoding correctness. This docstring details important information on the testing methodology. diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index eca433ffa1d0b..c10329a9ba974 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index 3dc37172285e9..4cf373809dba2 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 9edd8bd4c00d7..d20c549b09052 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index 0bff0ea1d7dba..407786ad3c647 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py index 16dffe6d7d699..5d9dd3f72a78a 100644 --- a/tests/spec_decode/test_memory_usage.py +++ b/tests/spec_decode/test_memory_usage.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
This test verifies that memory usage remains constant (or never grows) when diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 1a6693e168173..e8de410f8a941 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index ca37c9a68dfa4..f2d93203b8e10 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 7de54b3edb6c6..8a7c114856811 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index f73cf4b345fb2..55fcf00557476 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index f7ef9786a690e..8aceaadff8d38 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from collections import defaultdict 
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 24573e22487d0..9cfc618b9d950 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d303b7f1219a5..1733f66feec07 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence as GenericSequence from itertools import count diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py index 61e3b387973bc..21bcb6b822d1f 100644 --- a/tests/standalone_tests/lazy_imports.py +++ b/tests/standalone_tests/lazy_imports.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index ce8689f5b89c1..cd59d579e8d6f 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from vllm.distributed import cleanup_dist_env_and_memory diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 747ec56ad6298..c97f5968d58a2 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc import os diff --git 
a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 05d2c624df178..edc0849dff33f 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test hashing of cache blocks. Run `pytest tests/test_cache_block_hashing.py`. diff --git a/tests/test_config.py b/tests/test_config.py index 7db95e3f64502..dffea9138222d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import MISSING, Field, asdict, dataclass, field from typing import Literal, Union diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py index a9b4f5cbf78c3..b9593e2a3b7c0 100644 --- a/tests/test_embedded_commit.py +++ b/tests/test_embedded_commit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import vllm diff --git a/tests/test_inputs.py b/tests/test_inputs.py index d361808ed2f9a..e549834faf6f7 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/test_logger.py b/tests/test_logger.py index 046f70504c899..8f235f1474fe2 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import json import logging diff --git a/tests/test_outputs.py b/tests/test_outputs.py index c41bd6723ba11..4bb1c20f77f1d 100644 --- a/tests/test_outputs.py +++ b/tests/test_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from 
vllm.outputs import RequestOutput diff --git a/tests/test_regression.py b/tests/test_regression.py index e092945422edb..f5f1ed8e805e0 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Containing tests that check for regressions in vLLM's behavior. It should include tests that are reported by users and making sure they diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py index 9af810c4c1bca..39e3808d831ca 100644 --- a/tests/test_sampling_params.py +++ b/tests/test_sampling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the SamplingParams class. """ diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py index eecfa1db3d7e5..ef4aef3afc2e2 100644 --- a/tests/test_scalartype.py +++ b/tests/test_scalartype.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py index c45ed6926d772..e9138b9e8eb61 100644 --- a/tests/test_seed_behavior.py +++ b/tests/test_seed_behavior.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import numpy as np diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 902de1099e605..a782a3bf7716b 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 77fec0968000f..64706defb5960 100644 --- a/tests/test_sharded_state_loader.py +++ 
b/tests/test_sharded_state_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing as mp import os diff --git a/tests/test_triton_utils.py b/tests/test_triton_utils.py index eb8ad48fdead4..64f72668f29ce 100644 --- a/tests/test_triton_utils.py +++ b/tests/test_triton_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys import types diff --git a/tests/test_utils.py b/tests/test_utils.py index 42e0df1ffb017..a2fd845ea54b7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import asyncio diff --git a/tests/test_version.py b/tests/test_version.py index 56842b6d409d3..fd07abb59b1f8 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py index ccbb36bf4c06c..88e1efd8fdbb6 100644 --- a/tests/test_vllm_port.py +++ b/tests/test_vllm_port.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from unittest.mock import patch diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index c740fde426360..e218678c4363b 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle from copy import deepcopy diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 
079100e78b5f0..b289dc972c89b 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Generator from typing import Any, Optional diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py index 8942f88912830..d8288429351c4 100644 --- a/tests/tokenization/test_get_eos.py +++ b/tests/tokenization/test_get_eos.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This test file includes some cases where it is inappropriate to only get the `eos_token_id` from the tokenizer as defined by diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index b16d9af35be98..69b3c6294284b 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from mistral_common.protocol.instruct.messages import (AssistantMessage, diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index eddc630986ea5..09a3638fd2ed1 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index bcfa78ed41cf5..0570c1525e111 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index 772eeb345ca4d..5abb101644086 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index 4bf9b45fe212b..510b54790cd90 100644 --- a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py index ba0ad78f64675..a30c58b09fe8f 100644 --- a/tests/tool_use/test_chat_completion_request_validations.py +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 448347be6ec1d..8c01c86e29f2f 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index a40675744ba24..35153139350bf 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project import json from collections.abc import Generator diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index 910e0b2d51ab6..fff20c68d6212 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Optional diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index b320b335e338c..53ba03a0ae109 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Optional diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index 2917698481453..3b43b723d4387 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from copy import deepcopy from unittest.mock import MagicMock diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index efa6455c41df7..a17fab9aecbca 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy from typing import Any, Optional diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 21d7fce691c95..b26bdd34d890e 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import vllm diff --git 
a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 06e00187caf46..3a180c6794ab9 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import os diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index acb6b90f5f7f6..9c90df1b77010 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 19df22f780396..ab6cd3069e1c9 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the Pallas MOE implementation. Run `pytest tests/kernels/moe/test_moe_pallas.py`. 
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 20f9dd77d0e8d..a13cf7064d54b 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ b/tests/tpu/test_quantization_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index caa233ec3ff9d..4dbae7c15de3a 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa # type: ignore from __future__ import annotations diff --git a/tests/utils.py b/tests/utils.py index d21b18470b1bb..ade28a481261c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 61aee87529884..ad34becb1e8db 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib import pytest diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 1a7a31d98506c..897d181ec9d5b 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching.""" from typing import Optional diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f38454b1b2889..aa074f1bb37fb 100644 --- a/tests/v1/core/test_scheduler.py +++ 
b/tests/v1/core/test_scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional from unittest.mock import Mock diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 511d57d405ba2..85415f6ad4b69 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index 4217dc37e2df9..c6f7481ddde32 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 48c265560348c..161bcd4d3ef9d 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index a125d3fb79750..3eedc535d7f42 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import pytest diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 2fad37d6801bb..93e7c12f3a091 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import random diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index d04679c12448a..d7722142b207f 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5d52ad5f53280..957d50d0d9d85 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from contextlib import ExitStack diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 9b2f1a9199319..f70a3ce147ff2 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import ArgumentError diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index e78c7480a837a..3d7632a6037f7 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import time diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 8bea032f656fc..a01b205dfaed5 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git 
a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index e77916f958233..6284dcfb915bc 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index fac701c4ca35b..a83454ee67e73 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import time diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 4a23e0c1b212e..b58bc75fc9565 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from dataclasses import dataclass diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index 8c03f04330dd5..ffe0612124660 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 5f1fff200de31..a39ab47b8d870 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -1,5 +1,6 @@ # ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py 
b/tests/v1/entrypoints/openai/test_chat_completion.py index c650ccd0ccd7d..dffb32846c05e 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 333ad23795f34..a7c31c0642244 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py index 7b4583bc3bf37..ed4ecbe8484c1 100644 --- a/tests/v1/entrypoints/openai/test_multi_api_servers.py +++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py index be2d84f3bb171..2b2b147ce3e1f 100644 --- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py +++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import lm_eval diff --git a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py index 5363fbde00962..95465a25fc9d2 100644 --- a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py +++ b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py 
@@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import openai diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 13071f581375c..3d720fe0cafee 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import itertools diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index a21d92c52244d..ddf2836d08af4 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import filecmp import shutil import tempfile diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 9b2a720c11c46..9b257143d69d2 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( NixlConnectorMetadata) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index dc963251c962b..52dc21a2cdba2 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from 
vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 86eacb693869d..2312e21359083 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 3c3190b325636..e190e956170da 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index 02475f7c150b8..ea54038a2c775 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import ray diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 3800cb392fbad..612eca116f231 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Generator diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index f62770060160e..085b2ee09743c 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project import lm_eval diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index cbdb0b910d1dc..f35c3e194fa71 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import pytest diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 24b759bc1fa60..a2beb5ad71dbb 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index 0512a1e026603..ac0f3eb58836f 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index 220f05c7ff1c3..63fdeb5a6de84 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index 932b652aea32b..8c111f846b47e 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import Enum from typing import Optional diff --git 
a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index ed368fe828d07..682d84dc23d12 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 9fedbe4f9a01a..523b7ee231151 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle an Error in model forward and shutdown.""" import asyncio diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 0fe48da475c6a..a077d48fecbba 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test error handling in Processor. 
Should not impact other reqs.""" import asyncio diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 1bba19102ec61..88fc5297aaf50 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py index 8f7c0380d407f..124254a413377 100644 --- a/tests/v1/shutdown/utils.py +++ b/tests/v1/shutdown/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Shutdown test utils""" SHUTDOWN_TEST_TIMEOUT_SEC = 120 diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index b49ac45f3129b..eff8eff43ea95 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest import mock diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index f577fb4ab3295..9070d2b10f8b5 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test whether spec decoding handles the max model length properly.""" import pytest diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 50548219fff04..ffea86d0d19ca 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np diff --git 
a/tests/v1/structured_output/test_utils.py b/tests/v1/structured_output/test_utils.py index ffc0bceeee494..4e7c4b33e8c47 100644 --- a/tests/v1/structured_output/test_utils.py +++ b/tests/v1/structured_output/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 366fa3b2561fd..53242180b21ef 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py index 68539c80b59cc..c05de5e4cb645 100644 --- a/tests/v1/test_metrics_reader.py +++ b/tests/v1/test_metrics_reader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import prometheus_client import pytest diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 1b77417a1bd35..e5eadfd4e9dad 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index ee490071f6a27..0ab4e0bf59cf5 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import UserDict from dataclasses import dataclass from typing import Optional diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index b68f08385866b..a3df882a9e29e 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 1c0210b6a814b..7117a66c29584 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A basic correctness check for TPUs Run `pytest tests/v1/tpu/test_basic.py`. diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 01664598ccfde..55fee4ee1ad43 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test: diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index 8c87fc836b518..a61773a4f611b 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py index 8faa5270b5930..3a9d80847a16b 100644 --- a/tests/v1/tpu/test_pallas.py +++ b/tests/v1/tpu/test_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import ANY, patch import torch diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py index 811833f73cdbc..f4a2d5ac853a8 100644 --- a/tests/v1/tpu/test_perf.py +++ b/tests/v1/tpu/test_perf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A basic performance regression test for TPUs Run `pytest tests/v1/tpu/test_perf.py`. 
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 2bbeb3ddac91b..198bb1e16ed9f 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import pytest diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index ff9217f8f3cab..ca5c067b364e0 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pytest diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 348f12887a446..230c97e787a98 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import unittest.mock as mock import pytest diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 27741bd156be1..e932e4b323498 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from typing import Optional diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 6ba6d1f6f131d..ceb9d4df25e62 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py index 
c039431494c4e..83be8bdce85cf 100644 --- a/tests/vllm_test_utils/setup.py +++ b/tests/vllm_test_utils/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index 1d1219fbeffa1..2818428de4a73 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vllm_utils is a package for vLLM testing utilities. It does not import any vLLM modules. diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 3b25980cb9463..49fd083ef19c8 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import dataclasses diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index 27077f13de24f..9454221b273e6 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import dataclasses diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 9f99b3725fe41..3aabae099073e 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git 
a/tests/worker/conftest.py b/tests/worker/conftest.py index 372d71a78d0a7..3f202d4dbe948 100644 --- a/tests/worker/conftest.py +++ b/tests/worker/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 3e237aacc8c60..35ac90b38e840 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index a41fc52170fee..a5e61128d1e93 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index ae4b536524be0..0be25aa2fc35d 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 22466105b8aba..d8767f700b576 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 3ab8070999b00..6d9f404ac207b 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import torch diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py index 709befc53207c..92914186b16e0 100644 --- a/tools/check_spdx_header.py +++ b/tools/check_spdx_header.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys -SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0" +SPDX_HEADER = ( + "# SPDX-License-Identifier: Apache-2.0\n" + "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project") SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py index 18c9726a11ac0..77b2dfc391889 100644 --- a/tools/check_triton_import.py +++ b/tools/check_triton_import.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py index 6c201dd2543e9..63ceee5829aba 100644 --- a/tools/enforce_regex_import.py +++ b/tools/enforce_regex_import.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import subprocess diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 9601b578eb97c..209c3a576aeed 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 8ec3dfc97a734..038d3c44f043a 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 011af25229f4b..7368ae95313d2 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be diff --git a/use_existing_torch.py b/use_existing_torch.py index 7d352c6ca6fa7..a9f79e16981c4 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob diff --git a/vllm/__init__.py b/vllm/__init__.py index 52022fb8f0168..6232b657e8284 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" # The version.py should be independent library, and we always import the # version library first. Such assumption is critical for some customization. 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3c8e6b95ce763..008a7aa94939b 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import importlib diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index a9a624b85abc5..ae63e06030dd1 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 9cc2b181fc7cc..9753a08806565 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index a84fbea2e444a..7b685880a9e6c 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Callable, Optional, TypeVar diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index 2b604b91bbb6b..8135b54ba19f6 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 46e9629e1f55f..a1a56b6bbd4ba 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index 3c1d26404c990..07e85d138ac50 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Optional diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index a21eb7f599faa..1c16230849bca 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from pathlib import Path diff --git a/vllm/assets/base.py b/vllm/assets/base.py index 03f3b9dabf143..31cde431b5b6a 100644 --- a/vllm/assets/base.py +++ b/vllm/assets/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import lru_cache from pathlib import Path diff --git a/vllm/assets/image.py b/vllm/assets/image.py index d8cca9b74edd5..c977242a3d484 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Literal diff --git a/vllm/assets/video.py b/vllm/assets/video.py index bf06746a9ff66..01834aeeb6c12 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from functools import lru_cache diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 85c5715faba7f..344040586a532 100644 --- a/vllm/attention/__init__.py +++ 
b/vllm/attention/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index f3d6ffaeb8f45..deb3951d6617b 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from contextlib import contextmanager diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index ea4f840729b48..a2fd557f8e0cb 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index 4567893a9ef7c..39e667bca9cd2 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index eceab1f1ac9a3..3548df88d0c5d 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with Dual chunk flash attention and sparse 
attention. """ import math diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 7f8f720eee0ae..26be2c04f297e 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 37b20d0739f70..7ae7ea37f4afc 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import os diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 0e62748ddbee4..9a6b8a40e1311 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from dataclasses import dataclass diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index d701c59a234f8..5128e49752e11 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index f322c7b3dd6a2..30441b3ad136a 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 1007140ef3863..50842abd3924f 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ # MLA Common Components diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 19642a939b481..a6823ac059fb7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f1def25c89cff..820ddcab77d71 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index c974f2a15a0ef..855036071d0d1 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from dataclasses import dataclass diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 7134472daa605..755e0da06cef9 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer ROCm GPUs.""" import itertools from dataclasses import dataclass diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c1bd638f2605d..7606340044f1d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 6945c2c6e29cd..d9fff8fac1584 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Dict, List, Optional, Type diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index a281c9771a82e..e3f02a193614a 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend utils""" from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index a9d4a70b55a8c..8355e03977e78 
100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 9e4fbe0b4c6c2..6c5b05a5c7b14 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" from typing import Any, Dict, List, Optional diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py index bc87ce33a3015..05fa9d11f2283 100644 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index 6ab69ea5b4098..c6f6cc29793f4 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py index e64fc1139713e..445720c709c47 100644 --- a/vllm/attention/ops/blocksparse_attention/utils.py +++ b/vllm/attention/ops/blocksparse_attention/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project # Helper functions for 3D sparse pattern # These function are not optimized and very inefficient. diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 6ca2a64145bd6..4f839348e5222 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Authors: # - Burkhard Ringlein diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index 18b69a6b3ddf8..b85f27ac417cf 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py from typing import Optional, Tuple diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index a97c36338d3c5..412dd20ec1deb 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 1702203b18346..b7e4ba4d7416a 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Dict, List, Optional, Tuple diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py index f9fcfe6a63386..5cb1a47394cf6 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/attention/ops/merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 8c9145bb99e8c..e28ff7e8b4ed9 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import neuronxcc.nki.isa as nisa import neuronxcc.nki.language as nl diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 827c3041a682e..c6d1501e27578 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import List, Optional, Tuple diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 729b61b029063..13bef96722d2b 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # The kernels in this file are adapted from LightLLM's 
context_attention_fwd: # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py index 421891ab6b733..cce6b46394606 100644 --- a/vllm/attention/ops/rocm_aiter_mla.py +++ b/vllm/attention/ops/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 0f3cf1842c805..ad97152e208b8 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import aiter as rocm_aiter diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index fb983907e375e..c27b377aebe99 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 62cfb813d5f94..a26e713b1c624 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Fused Attention =============== diff --git a/vllm/attention/ops/triton_merge_attn_states.py 
b/vllm/attention/ops/triton_merge_attn_states.py index 30e61b6d82639..56d78ed5ea6ee 100644 --- a/vllm/attention/ops/triton_merge_attn_states.py +++ b/vllm/attention/ops/triton_merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 87cf333f7f0a1..92c09e6dd0640 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Authors: # - Burkhard Ringlein diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index ebbdea27f413e..cb577fa673023 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from contextlib import contextmanager diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index ca88549f3f729..69cde06fd72e9 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional from vllm import envs diff --git a/vllm/beam_search.py b/vllm/beam_search.py index ddacc669551b9..f3bc4218323d8 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 21fe3eb629e21..0ef3e0254cc4f 100644 --- 
a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. Each dataset subclass of BenchmarkDataset must implement sample diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py index a28630d50f261..aba60edc58cbf 100644 --- a/vllm/benchmarks/endpoint_request_func.py +++ b/vllm/benchmarks/endpoint_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """The request function for API endpoints.""" import io diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index dc1c42879b2cf..5c6124db80b4f 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 858a0c6a00e4b..019ebcf8d5041 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. 
On the server side, run one of the following commands diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 3ea6c194baa8a..be9ea39f0c38e 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse import dataclasses diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/utils.py index 45a0ddbd5d08d..f0bb99326ab40 100644 --- a/vllm/benchmarks/utils.py +++ b/vllm/benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 86eb465b8f658..64172a9bf91d2 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py @@ -6,7 +7,6 @@ import datetime import locale import os -import re import subprocess import sys # Unlike the rest of the PyTorch this file must be python2 compliant. 
@@ -14,6 +14,8 @@ import sys # Run it with `python collect_env.py` or `python -m torch.utils.collect_env` from collections import namedtuple +import regex as re + from vllm.envs import environment_variables try: @@ -815,4 +817,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index dc3e1482e2b48..ce4e50a2b02d1 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index c4bfffe929970..5af3b7efed2d6 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import dataclasses diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py index 84d1e1f77739e..4d7aeeb4d03e3 100644 --- a/vllm/compilation/base_piecewise_backend.py +++ b/vllm/compilation/base_piecewise_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Protocol diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index f651ee6912abb..f754fc2388b20 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py 
index 9293610cc2469..36c810ec2dc96 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import copy import hashlib diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 2200671b8848b..c584c103f4410 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import dataclasses diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 0ad480e28cd70..8bf957368f6ab 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from contextlib import ExitStack diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index f02994c55527d..05e4ca9f08b36 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from typing import Callable, Optional, TypeVar, Union, overload diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 70f3b8b6df94b..286221d32c1ee 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import operator from collections.abc import Iterable diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 618b2fe94d3a0..7e2c5b4fe66a6 100644 --- 
a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, NamedTuple, Optional diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index b9eeb0c8d2af3..9ef3889323887 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import operator from collections.abc import Iterable diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index a9359fe1e1170..810d0801e9f38 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import inspect diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 786c7c1e1859a..1e059b59fb64d 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index cef19f9257ed7..6d1893777cec6 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc import operator diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 13e4cd73f8ce7..46f70dcdc6886 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from collections.abc import Iterable from typing import Union diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 07ebd3e1b7dde..621c89a144874 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from torch import fx as fx diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 17dded87fe8dc..d41093903480b 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py index 4b881d0b6f2da..cd3970657522e 100644 --- a/vllm/compilation/torch25_custom_graph_pass.py +++ b/vllm/compilation/torch25_custom_graph_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Optional diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index 0fe73b72b1dee..3ccbf52d9fd38 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 1a8211f0ab7c6..8c8d0b5cb2291 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/vllm/config.py b/vllm/config.py 
index 8aa1b56103004..d99e501ca279a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import copy diff --git a/vllm/connections.py b/vllm/connections.py index 84e32a4d5ca9c..103505eb3d81f 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping, MutableMapping from pathlib import Path diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index d4d31c58dc8d4..444bb25f2830a 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from typing import List, Optional diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 1966eac1cf9e0..a337007a9eaa6 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque from dataclasses import dataclass diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index d64142e77f37f..ea490c32791c7 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Dict, FrozenSet, List, Optional, Tuple diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 301656996435b..1a05881f7c005 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index c388366b825f2..dae6ead04e9c9 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1ca9e49dac371..2913a01bf34a5 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Token blocks.""" import sys from bisect import bisect_left diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 910afdd9feff1..e933c6ee7c8bd 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Block manager utils.""" from vllm.sequence import SequenceGroup from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c6bf6d163132e..a33399204fafa 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A block manager that manages token blocks.""" from typing import Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0e363eddc8a5e..7ec4768e90b1a 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import heapq diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 4c1182debcec1..ba290eeda12b5 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from abc import ABC, abstractmethod diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 0f5d8ca6dc7ea..71b22942a3edd 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Tuple diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 06d4ed470b209..44be855b1bfde 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 6fcbca628c6aa..942e866ed97ee 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # cumem-based pytorch pluggable allocator to implement sleep mode. 
# other approaches tried but failed: diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py index 39955ddacfe94..e911b2a1ab284 100644 --- a/vllm/distributed/__init__.py +++ b/vllm/distributed/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .communication_op import * from .parallel_state import * diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index d85a41ddac221..0a5a95176f7c3 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional, Union diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 7177754a37115..ae75902994423 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util from typing import TYPE_CHECKING diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 52b970949144f..38370d4dc2b51 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import threading from typing import Optional from weakref import WeakValueDictionary diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index c04218cb9f394..94effa0b2ca88 100644 --- 
a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index a05a13f51d4bc..0eebdf8736ce2 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 6c15ef644b8c2..2c38e8ed21d7d 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is a pure Python wrapper for the cudart library. It avoids the need to compile a separate shared library, and is convenient for use when we just need to call a few functions. 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 5c2dbcc27b13c..7dd104a4fcc4e 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Optional, Union diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 11b8b57fe2aed..7c6001e870392 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ctypes import json diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py index 9536a7f883e1b..f00f6b62bf24a 100644 --- a/vllm/distributed/device_communicators/hpu_communicator.py +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.distributed as dist diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py index dfa4b5194bdbe..5b61a1687a016 100644 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ b/vllm/distributed/device_communicators/neuron_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from vllm.distributed.device_communicators.base_device_communicator import ( diff --git 
a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 0ccd423121cb0..29486292996ad 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 6f69089b61968..04a4d0147f5d8 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # This file is a pure Python wrapper for the NCCL library. # The main purpose is to use NCCL combined with CUDA graph. diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 40e57e6624d1e..0f66f0aebd7f6 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle import time diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index a1775279661d1..c60a7a7eb25cf 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index 
256e7965e0a72..216ff85c8bb7e 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 29c6a70c4d26f..9bf1c058a1915 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import queue import threading diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index 8b6abf5a80dd0..fa9b7e4f14c02 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_transfer_state import ( KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index e9b70610e8cdf..181c33925da76 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ KVConnectorBase Class for Distributed KV Cache & Hidden State communication diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index dce0b545c188e..58dfa251c735d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import importlib from typing import TYPE_CHECKING, Callable diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py index d121cb701bef3..78bf3095613a7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LMCache KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py index 58eabd0a37ebb..94a7ce91acf17 100644 --- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ MooncakeStore Connector for Distributed Machine Learning Inference The MooncakeStoreConnector transfers KV caches between prefill vLLM workers diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index ed8fe38161e97..e7c079e1f115c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Simple KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index b1c9c9af6e235..c62444e756cfc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ KV cache helper for store. """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index e66aaa7f8af8e..f00f31dde915a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorRole) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index bc9258e9d07b6..8f9d70eec038b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State communication in vLLM v1 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 2cb68dc1ff675..cc1f4ba356428 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING import torch diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 0aabb260fd3dc..5aab10b2b1ad8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 4d228dbc9d492..3f0b0e2952196 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import math import threading diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0421a65a2c819..f86b92692a0e5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import os from dataclasses import dataclass diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index 819c06805ee47..8633fdaf59f8b 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A centralized entrypoint to perform distributed KV cache transfer. 
This implementation is a shim wrapper on two APIs exposed by `kv_connector`: diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index d1ffb8092dfc9..eef14269f1961 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains a new class `KVLookupBufferBase` that allows developers to think of KV cache operations as inserting new KV cache entries (`insert`) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py index 5bb7110216768..4381aad1e9956 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains a new class `MooncakeStore` that allows developers to think of KV cache transfer operations as putting new KV cache entries diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index e3b2274bd8a41..a0ff7c320f61e 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Implements a distributed key-value (KV) cache transfer mechanism. 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 40589fb3ef872..1423fd032477e 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file defines an interface `KVPipeBase` that provides an abstraction for sending and receiving tensors, or None, via diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index aa4b1ba71492c..9f3494b8106e2 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 761c56f7e41f5..09de0b682efca 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module implements a PyNccl pipe for sending and receiving Optional[torch.Tensor] between distributed ranks with advanced diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 25d2f2cf5c6e6..60f1d5d8bca75 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional from vllm import envs diff --git a/vllm/distributed/parallel_state.py 
b/vllm/distributed/parallel_state.py index 32c9301bf23d3..10f87c49baa9e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Adapted from diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 96d08dc1a3c18..67f71643d039c 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Adapted from diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e3b8a18ccdfef..587a23134fe90 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable import argparse diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 19b219b674f38..6d8d97cf5feba 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 94674262bcfe3..28a023a71ef52 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Workaround for https://github.com/python/cpython/issues/86296 # diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a9600a2c8aa3d..dbcf78f023611 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import copy import time diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 916afe0c8e5f7..8d51f0472351b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import TYPE_CHECKING diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index acc83011d6c8e..9375dc4c495ba 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ These types are defined in this file to avoid importing vllm.engine.metrics and therefore importing prometheus_client. diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index af72c8e6b7766..bf9f669031cb0 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import uuid from dataclasses import dataclass, field diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 18b7c187bdffe..f2f4424859331 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 434cb49855621..ef088bd3933af 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle import signal diff --git 
a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4c8e295c13815..19c5963d32dbb 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Callable, List diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 110f84a65efc9..e0fa6a00ecfa4 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from typing import Callable, List, cast diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index e88f119c87426..dbf6a371d050a 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 6cad9ec8f327f..7925d91f60640 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, List, Optional, Tuple diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 0d2b58c109e32..1e127eb982425 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from typing import List from typing import Sequence as GenericSequence diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 28341c2c633e8..727d59283643c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from abc import ABC, abstractmethod diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1c027181156f1..56f8754c266bb 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. It is not intended for production use. diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b051cd3338a4c..95c806c228b82 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py index 94fb415f581f4..30a8844108002 100644 --- a/vllm/entrypoints/cli/benchmark/base.py +++ b/vllm/entrypoints/cli/benchmark/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.entrypoints.cli.types import CLISubcommand diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py index 5aca16e0b640c..e0358a262dcdc 100644 --- a/vllm/entrypoints/cli/benchmark/latency.py +++ b/vllm/entrypoints/cli/benchmark/latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import argparse from vllm.benchmarks.latency import add_cli_args, main diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 9e857af7d6dbd..717da630ab4f0 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import vllm.entrypoints.cli.benchmark.latency diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py index d5a858920ebdb..3043701570230 100644 --- a/vllm/entrypoints/cli/benchmark/serve.py +++ b/vllm/entrypoints/cli/benchmark/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.benchmarks.serve import add_cli_args, main diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py index 88ee6aa038578..20431cd3d8702 100644 --- a/vllm/entrypoints/cli/benchmark/throughput.py +++ b/vllm/entrypoints/cli/benchmark/throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.benchmarks.throughput import add_cli_args, main diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py index 810ecfdf71c32..141aafdb1a618 100644 --- a/vllm/entrypoints/cli/collect_env.py +++ b/vllm/entrypoints/cli/collect_env.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 5eba72fec13cc..3e834b3b29647 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project # The CLI entrypoint to vLLM. import signal diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 215fcf3c3e44e..58dcdfe217fd5 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Commands that act as an interactive OpenAI API client import argparse diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py index f74c8da9b9b86..353034f881f7d 100644 --- a/vllm/entrypoints/cli/run_batch.py +++ b/vllm/entrypoints/cli/run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import asyncio diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 040ae166a2d5f..f9c56e6554617 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py index f739a68c5f4c9..0a72443129758 100644 --- a/vllm/entrypoints/cli/types.py +++ b/vllm/entrypoints/cli/types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index a4f70a51ebaf3..9f4dc19fb4ab7 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import signal diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e05189ef49611..fd28bf39e2d56 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import warnings diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index d4655dd5e6ab8..f3aee188dae94 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5a4295ff716db..2f8819bca60da 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import atexit diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index f196ff6ed3021..ca70e78df3260 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains the command line arguments for the vLLM's OpenAI-compatible server. 
It is kept in a separate file for documentation diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 04d5091a96811..29d72256cf70b 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from functools import lru_cache, partial diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e72c23993ac8c..ecfcc00687ad8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index ac250b3cb4fbf..9994b3cae8888 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import tempfile diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ea8e187dc6b7f..7e514d660be41 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 90cdd389d59f0..3ac4f01ea6028 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from http import HTTPStatus from typing import Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1c06070cb3154..ce5eca8550289 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 3785d2642f9d9..e87decfe636ac 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from typing import Final, Literal, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f96a4ac8b3a51..ac3883bdeb33c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io import json diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 74433a1a3c3f5..764b0e73690de 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import pathlib diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 7c401d4f5cb14..b896cc46b9d08 100644 --- 
a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import base64 diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 9bdacb5518d6a..f58611c49b88c 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time from collections.abc import AsyncGenerator, Mapping diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 0d739bbf9bf22..3db0a71fadd15 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Final, Optional, Union diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 9fc5b562e7d5c..f667c7e9b3a96 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import io import time diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 054c0b006b2fc..3e4f4e149c9f4 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .abstract_tool_parser import ToolParser, ToolParserManager from 
.deepseekv3_tool_parser import DeepSeekV3ToolParser diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 931d5aab9bd9d..02aeab6136316 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index 14e743e13a727..60025af2a6f33 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Union diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 383e0d44de99f..5508ba6a39408 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index b8bf142530ee3..fcc5b7edda83f 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json 
from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 2b9f9852bcb32..c7030d34d453e 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 3f2799f8010a5..e5dcdf9a07602 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 2714a545f997f..66b483d8b0f66 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index 323fb144181ea..6bf44a4345a9d 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json from collections.abc import 
Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 4eda7044cbbaf..5698bc70af23b 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index fecad7e653abc..ef5b14f3cd280 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 00690ad79a7ac..5501028cf36b8 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index bc5d15dcb82f4..73329cdf701d6 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py 
b/vllm/entrypoints/openai/tool_parsers/utils.py index acbff3258e465..aa41cd6dc53ed 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from json import JSONDecodeError, JSONDecoder diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 80b6c07c603f9..c4e044f3a28e9 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Union from torch.nn import CosineSimilarity diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py index dba916b8bf13f..e3646a60a7cc1 100644 --- a/vllm/entrypoints/ssl.py +++ b/vllm/entrypoints/ssl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from ssl import SSLContext diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 1b0ea69096cc6..6fb32ff187cc6 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import functools diff --git a/vllm/env_override.py b/vllm/env_override.py index 71f031d1e2313..b0a061d2c4ed9 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import torch diff --git a/vllm/envs.py b/vllm/envs.py index 3dd0d9045372f..2e3d6eeb57e8a 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import os diff 
--git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 40ca1d29939af..99e12201c96af 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index d1f8c36fbbec7..4e8c6d79095f9 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index e680d53cbd10e..852c8f5cffa0c 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from typing import Any, Type diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 380b672c3605a..a6c172beff7bb 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 8e67c7a41bb19..bdc2b1f4c27cd 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7bc98a16f041d..c222f1609096c 100644 --- 
a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 1d3a6e443a80e..7ebeb4a22556f 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/vllm/forward_context.py b/vllm/forward_context.py index f192be1c40d54..f3b0518a44e03 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import defaultdict diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index df4f844cd815e..37bf2b7a44366 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 843c45bd6163e..23cb5e5022f19 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 4c64a41ace310..8c3700799e4ab 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Literal, Optional, TypedDict, Union, cast, overload diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b9acabeabd8df..a13e563f34a14 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import Mapping diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f424a8f613ab1..73d19aecde6c5 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union diff --git a/vllm/jsontree.py b/vllm/jsontree.py index 91cd7cb216d77..4cbe0f76e0067 100644 --- a/vllm/jsontree.py +++ b/vllm/jsontree.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helper functions to work with nested JSON structures.""" from collections.abc import Iterable from functools import reduce diff --git a/vllm/logger.py b/vllm/logger.py index fd16dd95bb1b3..0ddb83cb8ba7a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Logging configuration for vLLM.""" import datetime import json diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 7ab4632589bf4..cf690a89ae9bc 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import 
NewLineFormatter diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py index 47ce0ab188bd6..d14515f56e54c 100644 --- a/vllm/logging_utils/dump_input.py +++ b/vllm/logging_utils/dump_input.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import enum diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 010b0a124987b..0affef10078dc 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging diff --git a/vllm/logits_process.py b/vllm/logits_process.py index 29a73656bf65e..5967d0836bd45 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Union diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index b6b138a44051f..7fc4cfe026aee 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # pylint: disable=unused-argument from typing import TYPE_CHECKING, Optional, Union, cast diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 023c8e9c9a864..66e037a97d063 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # pylint: disable=unused-argument import math diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 294b49e0a8997..958364fca592f 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from collections.abc import Sequence as GenericSequence from typing import Optional diff --git a/vllm/lora/models.py b/vllm/lora/models.py index dfdc908d7e05b..262e6799583ae 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import os diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py index 85601d58c9d73..22aa3c63dce19 100644 --- a/vllm/lora/ops/torch_ops/__init__.py +++ b/vllm/lora/ops/torch_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index ab65faceb2c10..cba5baad86686 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index 5a39705e85712..805de4b6f6570 100644 --- a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py index 0f971c03592d1..e93064d0c83ad 100644 --- a/vllm/lora/ops/triton_ops/kernel_utils.py +++ b/vllm/lora/ops/triton_ops/kernel_utils.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Utilities for Punica kernel construction. """ diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 9feb9e4624591..9e1f90e757cde 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py index ac459a83220c7..39e647b9b88a4 100644 --- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py +++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LoRA kernels metadata preparation utilities. """ diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index c3871bd58ffa1..3f9edfc6d655c 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 6225635c2955f..5857f7fecb5b4 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py index 94062b05d9161..7e7c3c892457a 100644 --- a/vllm/lora/ops/xla_ops/__init__.py +++ b/vllm/lora/ops/xla_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, bgmv_shrink) diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index dff4d5181efe2..9118f3351ef0a 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import jax import jax.numpy as jnp diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 7d335e5f7fab1..a20d73f0f725b 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py index 915fc6623398e..e664ffa1dfe6e 100644 --- a/vllm/lora/punica_wrapper/__init__.py +++ b/vllm/lora/punica_wrapper/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper diff 
--git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index e03f7329021b3..5b4902dcbeb35 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index 8118a72d696a2..59049cccc8cbe 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional, Union diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 224640ec71925..6b038309d55db 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 416c23e73bf85..b20c9785a74c1 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union, final diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index 922d6c0600037..c684ac77cc9ca 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logger import init_logger from vllm.platforms import current_platform diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 0556e583f409a..6b48268c5006e 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 1adb40b4c284b..0b0a7989f3907 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 616e94f8d678f..5bbba7830c1b1 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from typing import Optional diff --git 
a/vllm/lora/resolver.py b/vllm/lora/resolver.py index 33f35322fe85f..5808ae105e864 100644 --- a/vllm/lora/resolver.py +++ b/vllm/lora/resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Set diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 619dd3bdc40af..ee196e3f689a2 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index f1ae030975074..7da44569f4086 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Any, Literal, Optional, Union diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 7636152176f13..55dfe8088c8f3 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index acf7224675e4f..7e6cdd9875106 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch.nn as nn diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index a2b61a1b19e4d..3c2998bece441 100644 --- 
a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index 58adcc3caff99..05b6a1c3239f1 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import llguidance diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py index e17df68b4b4da..379b5eaa38a76 100644 --- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py +++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import os from typing import Any diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 316860718b77b..fa97b6dbf5115 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional, TypedDict, Union diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index 7eaf9e38e66a3..f9b51f4c15745 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ 
b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import lru_cache from json import loads as json_loads diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index e41af4b360e45..26c2d958e7511 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import concurrent.futures diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 6986b6554c230..4ef4db7c4a399 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024- the Outlines developers # This file is adapted from diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 3f77cf394d9a3..8fdfa983e120b 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import regex as re diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index d2e5686099459..bdd3a1a9c0a59 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project # noqa: UP007 from __future__ import annotations diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index a32c26317a884..cc9c8d445ab6c 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom activation functions.""" import math from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 5c262287f7dd4..2bdc96e297c1f 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Any, Optional diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 26a433da2189a..d827869d05382 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ CUTLASS based Fused MoE kernels.""" from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 46a814e6ecc3c..331544d64ff83 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import importlib.util from typing import Optional diff --git 
a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index c2db793659312..205a95e7ff1e4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused batched MoE kernel.""" from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 4c84dd5383320..40b76994f412c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" import functools from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 78f8eb926dc83..883a48c984f21 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE kernel.""" import functools import json diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 1e193c909f617..3ce4cbc2838e9 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib from abc import abstractmethod diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 
7d3ddf8f14c4d..5e321c9b43af7 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index d025f1257a9f6..98e175b12ed45 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 9d8bd62c6969a..d35bd0098b3ca 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index cb396f26c96e0..da78714341513 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py index da27633f27239..6160da7329518 100644 --- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +++ 
b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 783ebebbfec94..8405603cf28a0 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pplx_kernels as pplx diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 98f98b3bd20bc..77a9686c93a63 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 824062491f0ed..d44989cce724a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import IntEnum from functools import cache from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 2cfe373140bb9..373e8ab396bc3 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index d9d2520e18b3b..c3a58478247a7 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from math import prod from typing import Optional diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index e8abd32ff6ba6..b3c65e34178ad 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom normalization layers.""" from typing import Optional, Union diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 96659af408ed7..978086d1909d1 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from einops import rearrange diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 269ac043d26c4..588aa8deb1832 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from abc import abstractmethod diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 
6b69a260826b1..3d01253447c03 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that compute logits from hidden_stats.""" import inspect from concurrent.futures import ThreadPoolExecutor diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 019f634a9ef41..88053faf9e524 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 156e8752e96cf..118bd8d55c1d8 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from torch import nn diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index f94ab75f9a4f0..6d9ea5387879b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 21e27160f090b..a10c5ab697874 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao. # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 689c940d11ba4..ccfb278cdff6c 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py index 0fdb055aab82f..11ca1255ebfb6 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index 1652c51814cdf..365e1c54b555a 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index ee633569097b6..58bfb661d332a 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index 79a1663b85bbc..b121275e9eb38 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py index 6f69ca74389e9..a28fc9ffad71b 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index d2c42191bb3ff..258038bed40bd 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import IntEnum from typing import Optional, Union diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 407b9c72f41d8..1cb23e7a18875 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Literal, get_args diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 8bf0ca5c0448a..2ea8c5dc51132 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index eb8ffa37882cb..ea17cd56c9855 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from fractions import Fraction from typing import Any, Optional, Union diff --git a/vllm/model_executor/layers/quantization/awq.py 
b/vllm/model_executor/layers/quantization/awq.py index 87afdb623d912..f8bc3ab5e7d1e 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 0c8d082bb428d..56d803c6baf12 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index 5e54915789792..ebc526d6db2f9 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index c9533da9d46eb..78c5c75c06515 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from abc import ABC, abstractmethod diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 1cd12bb763178..9e5ce39ec8f2e 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 049ce7a7191da..38935bc967855 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 27547f315fef3..dff62af863895 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import suppress from typing import Any, Literal, Optional, cast diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 9241ceeb4db29..ebb029572a139 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from enum import Enum diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 79bf5c108ac2e..25924c733e760 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .compressed_tensors_scheme import CompressedTensorsScheme from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index f010bc03418c3..30ed55aee04f8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index daa25d23a3060..a5d48f2356744 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 6ea31e50caa72..3f3e7668fcf74 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py index cf60b34ba78a9..8202ce9514969 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index 61e4918ca47f2..01a87a0888996 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 99bb73b71e9f4..1e61e058cb84c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py 
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 7792ce86553c6..6189f0609d85d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index a33c58acb045c..74787603e0029 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 2380d35702c61..9bcf1aa2bc1cd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 75e81c4dd49d8..402646498cee1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 0c1eaff93e8b1..8030be5259445 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 3601d219df3b5..01b0064f08058 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 223682ee97650..3e465ee2cdd21 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/fp8.py 
b/vllm/model_executor/layers/quantization/fp8.py index ac9b74945e0ce..cea4d26a4c48f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import importlib.util diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 1fcb6d7afc9b3..2171f729afad1 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 436f1e3ccc1a5..d3ab1be3bee01 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from enum import Enum diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index be9510abdffb3..78e0f59fa4bee 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cf012e145ee68..f92ebdea986da 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional, Union diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index e90416f377915..eba917d854118 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index a8faf97723cd1..ee8a0e34b32e5 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 8108c797637d4..31ad96eccaf3e 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 55ad00b1cf461..07ecc096231a4 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index bb1dc40ad71a7..0bf0d530d2351 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py index e07177dd675fe..785e559df8f75 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py index 29e20699184c5..649d07b4d0723 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 50d293cf415bf..fef333e862d5a 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ 
b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 855867fa4a006..c7c45861875af 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index 899011f000515..1597492a5cf65 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 2d92af74bbf9a..9ebf5f3037922 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 5d58c0489a286..18f5ce04fd355 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 6c2c464e6f1b3..165548a060128 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 98a0b30be1f62..6ddd4a9ec4233 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index c09ca83d01cbb..817565cf28277 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index a97b53b9d7b95..3de28af40aaa5 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 67723c7c91cc5..e5604670fb4c1 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 2437030c87717..62667db26b669 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2abe16a08a265..3f79b203aa170 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional, Union diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 74bd6dc13f84a..3aa23f0682576 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index b2d6bf5dbf9cc..8040236663dd1 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from importlib.util import find_spec diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 9e4fb33639b21..32ba1055f9c83 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 6028b8a2ada3b..25978cb13b3ab 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index df4bfbbbcb4c0..6ae5f5c9ad46b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch from typing import Any, Optional, cast diff --git 
a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index aa7d725433eaf..4c2da4c8b04ee 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py index d7dac17574ffe..ec09d9b2ac26f 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .quark_scheme import QuarkScheme from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4 diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py index 40c8ea86d3c38..c167e949ac262 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py index 34c077b29163a..3c56251b7a009 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 149c9093797f2..47e0a492b23b9 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index 94f9fcd56acac..ae68d5bbc2680 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index 5e56bcb7564cd..99f5ec15933ab 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index c0be40c16affc..a108152929d9a 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project """ This file contains the Pydantic schemas for various quantization-related parameters. When a relevant quantization technique is specified, these diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 7f9f3e643bfa2..af362f7a7d2d2 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 7941ec9732fed..83c8a98eac913 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py index f7ee472885140..6ad56bae3dca0 100644 --- a/vllm/model_executor/layers/quantization/utils/__init__.py +++ b/vllm/model_executor/layers/quantization/utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .layer_utils import replace_parameter, update_tensor_inplace diff --git a/vllm/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py index 97860765a9e14..1992b4d201478 100644 --- a/vllm/model_executor/layers/quantization/utils/allspark_utils.py +++ b/vllm/model_executor/layers/quantization/utils/allspark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git 
a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py index 70d24cc897e10..82ee3edfd5e19 100644 --- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py +++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 4c213f2c874ea..1ebd2a8985824 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/pull/2575 import functools diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 36161d13b24f8..db82b0def1653 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy from typing import Optional, Union diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 72fff3fa1aed1..a694a191745d8 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from 
https://github.com/sgl-project/sglang/blob/4cb53ecd0cffceb6dee5c011a58f65997a86f151/python/sglang/srt/layers/quantization/int8_kernel.py import functools diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py index 5acae7ca3b84f..fbc0f23acb59a 100644 --- a/vllm/model_executor/layers/quantization/utils/layer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Union diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index 6d840b5686123..580c36a0e2fa8 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index e059a7ac3f926..7540a1516fcb0 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 13dcdc00a2156..ca10db69dc168 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 1f6e74244c5d4..5372c49d9838b 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 81112b27f53a8..b2c228c242532 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility functions used for tests and benchmarks""" from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 73feb4264a8bb..1c93c364679da 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility functions used for tests and benchmarks""" import random diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index 0123540fc5ddd..8a64bebae04c9 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy import torch diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index e7c95e38e9fd1..9d4a188f52dfc 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py index f292208311e25..6e8e98d544f8c 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch __all__ = [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 6ba327f3db7a4..d6b96774b4e8b 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is used for /tests and /benchmarks""" from collections.abc import Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index eed8998fe3da5..adc67aa64952d 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional, Union diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3db73495827c6..a6e58a77d42cd 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import cached_property from importlib.util import find_spec diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index 839688e313aae..3f2d571777c00 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index afc0597197962..9de2338968a1c 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 32375db0c8f1a..08840fc40cf6a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" import itertools from 
collections.abc import Iterator diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 969cd59b57ccc..0a36fe9be45b1 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from typing import Optional, Union diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index a14c86148e730..5dabaa5379e7b 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.jit diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 001e6aaf0cc7f..d97d842386972 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility methods for model layers.""" from typing import Callable, Optional diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 46d2075af99da..0f636d83a6dd9 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from dataclasses import dataclass diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 
a443a652d8a3f..f364371033f53 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index d619d9f25e087..5018c7d9a360b 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod import torch diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 3df835a938968..ebbb021cad645 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import fnmatch import glob diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 6946627a54d24..4624ff01ddc03 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import glob import os diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index 64fa2be76d08b..f4a7da5744e04 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project import torch.nn as nn from vllm.config import LoadConfig, ModelConfig diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 1eac504227e25..203c80760145a 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Generator diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index e65d16cae76cb..fad97aba84b6a 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading Neuron models in transformers-neuronx framework.""" import ast diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index 72ad4da296ac6..f450961c64ff4 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading Neuron models in neuronx-distributed-inference framework.""" # Disabling yapf because yapf and isort have conflicts for the below imports diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index a39e26c6da50d..83e0f386c1082 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project # ruff: noqa: SIM117 import glob import os diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index b5a5031bb6f91..2fd9cfba3f61a 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import collections import glob diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 90c0bdf08ef88..24d1e136539a7 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import contextlib diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 1923e040af381..b9982f312fe52 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import copy from collections.abc import Generator diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 9c8d647a24fea..e6eaade090275 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading models.""" import contextlib import inspect diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 7a9a68be8805e..857f4bca68245 100644 
--- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for downloading and initializing model weights.""" import fnmatch import glob diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 3580c4fa52525..27c169d2d1e81 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsV0Only, has_inner_state, diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 6ab03c40ab4a2..1651e3e429e64 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Optional, TypeVar diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index 2e2a18abd03dd..b13d863ebb744 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # A modified implementation of the AIMv2 Transformer # inserted here also the image tokenizer used by Ovis2 diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 94a4328564bbb..4693c9487a8bf 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project """Inference-only Snowflake Arctic model.""" from collections.abc import Iterable from typing import Optional, Union diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index f74e13888c48e..bb4177dfc4574 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Optional, TypedDict, Union diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 08d49d71eca12..22efb707af738 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 Adapted from +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union, cast diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index bcff6eb3fd315..0de5de5e835ac 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index d6a705fb1859a..29e0e2a2edb15 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Bamba model.""" # Added by the IBM Team, 2024 from collections.abc import Iterable diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 92bbe1bb67a3c..a0ec12674f19b 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Derived from BART implementation posted on HuggingFace; license below: # diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 0b1d0f1034083..389393987c811 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 8a387d71f1cb0..0f22393c79d98 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from copy import deepcopy from typing import Optional diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index acbc5d04d7e35..2b457fd8a5b25 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" from collections.abc import Iterable diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index db0dd2051d527..279541bed55a0 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 10424e218fbcc..6e4a399f3cc6e 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a4528ca26d010..aea44261dd69f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from functools import cached_property diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 4e95afe1a1474..129f0942f14ef 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM 
weights.""" diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 9fd528fd79779..dcab008228704 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" from collections.abc import Iterable diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 546b5f932877d..ee67cc64050e7 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py index f1cc7e0f9e293..f03c58a12932f 100644 --- a/vllm/model_executor/models/constant_size_cache.py +++ b/vllm/model_executor/models/constant_size_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index f21887f71d857..7a4dd69443ad7 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional, Union diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 88d1ca9f7b833..2f0202f1e038d 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 03ef7bed0edcf..6e6e74b0d1d9b 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index b78c193c1345a..0f996d04e6e80 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 5c8793f59ffbe..765718e575203 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index fb1675d29915d..2219321457b2a 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 838560692bcf5..aaf105ec2552a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 00dbbebb120e8..d78ee100b26df 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. 
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 376793594f8ba..62a93dabd5d7f 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 1c0e3911fccee..28f257eabed01 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only FalconH1 model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index f8acc56706d2b..47760aabb9591 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections import OrderedDict diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index fbad7f56d0ba7..cb141dbc5aa37 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 0f6d94e7518bb..99ed51f8e70af 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index b46716213c626..ce405041b3d4a 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 3a88adcce0bdd..e19e0026b3f99 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The vLLM team. # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 182cc86d3ca8f..23e25170799ba 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index 6269ebcee5c08..defa77b84e441 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only HF format GLM-4 model compatible with THUDM weights.""" from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index f351ce5a06810..5e2908a82c418 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The Zhipu AI team. # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 4e13716719ace..034c7654f4d94 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/CogAgent diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index c2c310fca4d94..fd3decbaebec4 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c4ae4fc3c0062..661a67bdc0db0 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 69fdd90cfbe8b..bd162a5e57bc1 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 401fa9f5cc8bc..d418d8bb86cee 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ 
b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 3524d036db222..bd4d5d0b6b28a 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index fd8fb48c50e3a..831164ba88a4d 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index f342dfff824f0..5a70f3a616c6d 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 443b102c99680..f434b7a74e486 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only GraniteMoeHybrid model.""" # Added by the IBM Team, 2025 from collections.abc import Iterable diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 817e6091d276a..bb160dbce45b2 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only GraniteMoeShared model. The architecture is the same as granitemoe but with the addition of shared diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 6a444e8d1068c..4273afbf46998 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from typing import Optional diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index bc9e9a3c02064..2d930527b2be0 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/ROCm/vllm/blob/cea7419f151cc50293a05b7fac8547f8f887c9f6/vllm/model_executor/models/grok1.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 904f5330c653e..8f7f359b75521 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index b8bdc7aa32b25..9e27200fb1c89 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index fdb128ef5b541..4bc5e2a0cfaea 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8be8841c1f6c9..cb2a4062b84cf 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index d325a6b671328..4a1ea74a218a4 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 538e9de4f78fc..58e8163e0b26e 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 3f3e3966e838a..e8549b4e05384 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from functools import partial diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 
6893d0239121d..4bbb49da0e96f 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c37d3afb4e440..0c61369c5f518 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index d6a1e0bb48454..bed4a5dff2efa 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6f9fa60c9b05e..8294f846bbd10 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Jamba model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index b575f44765a89..351d1fbdc7444 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project # ruff: noqa: E501 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py # Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d36b6466c0bb9..5d5080479e510 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 40fdd84d8fb08..a852be66bde82 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team. # All rights reserved. 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 172dc8b5ec06a..f73b863fef23d 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 1e40017fc792a..d31a321b876aa 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ced71b6dcdebe..725e1b2c19481 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 2fb79f57a67f1..6f5f231875de5 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 9303ea1217273..a3406d090db85 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 7ea759fd59b82..d90d3d4a0960d 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ce76a76b65743..8162ac3f7597d 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch MAMBA model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 65c6467bcf5fb..cf9e1bd03e986 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch MAMBA2 model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 47d0ef9cc6bb1..49ba974c69a5e 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 95ef1134b1bf9..709a5a993c6f7 100644 --- 
a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index 49ea64e029d63..9b83f848ef428 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index cbca6a4c8f9d2..6066ec76c5fc0 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index f471a86ffba34..d398a5d12bbcd 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 2a6867d12d993..92c13e81bf3e4 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
# Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 039c3d22d1604..06c2eb4e80afb 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index ae5df0f9273f6..ff5959ed196ea 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 04cc7e35e3450..4100fee0ec841 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py index c95cbb419eb95..9164ac06a3b0a 100644 --- a/vllm/model_executor/models/minimax_cache.py +++ b/vllm/model_executor/models/minimax_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import torch diff --git 
a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index ac0fe7b10c836..02800449bda3c 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only MiniMaxText01 model.""" import copy import math diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 14c1250ca3b42..b2ededcaf67ce 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from typing import Literal, Optional, TypedDict, Union, cast diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 051a73120838e..9147240b2b2a9 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9bc7a16153e1f..dec365119c725 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 8220200d270c2..3183c762d2b14 100644 --- 
a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 713c9e8d203fa..e9f91feb3359d 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 58549b10e9666..54fae279d531d 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team. # All rights reserved. 
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index a7d7aa7d44ef2..c6a97388dc188 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 18eab6051736f..35f416a6e21e8 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 25e6f594069ef..11a2a384c165e 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 640a2049a6293..1fa76b9ac7afa 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 9f11d4a422733..d0fdab13ef0c9 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py # This file is meant to be used in kimi_vl.py only diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 6c396d778ae71..0878ada34d1d8 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index d0999e30e1ba4..eabf47b1aede4 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 9808fe05558e2..a766ed9476a65 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 172434e66ae2c..2f7f8e437f0ad 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from 
https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index fcb7c619a1020..1dc4df85c1bc4 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 33adacdae5f5b..499e6d30ed6b0 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index af289455527ce..ebfdb690fe29b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 8376d62410d4b..9eaac1e28dcd8 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index da2a194e6bdf4..d121188ba5d4a 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 232a63c506890..5c11d54c61247 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/ovis/modeling_ovis.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 427005e9b7041..a0e2912578c51 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index d46b95fea5a8a..f8db99eb92ba8 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 330ad5c59448b..21d517b3a490f 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 8f84e0726951d..f4e870c530309 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index d00d7d886d671..533655fd52004 100644 --- a/vllm/model_executor/models/phi3_small.py +++ 
b/vllm/model_executor/models/phi3_small.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index b757e661d7712..376c53d2cb99a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 418ff900ffd52..924e6436897d4 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 98cef75069ae2..ae7a8a732c446 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
# Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com) diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index f468fdbd5417f..c4890d8427e2a 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index d9917c26d1b12..dddd19c7462be 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 9f28d4cef4251..705586b6a6ea6 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 55a65f8078a4d..670576c68efdd 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only PLaMo2 model.""" import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py 
b/vllm/model_executor/models/prithvi_geospatial_mae.py index 40ac5e30a368b..4fdcae5de644a 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The vLLM team. # Copyright 2025 IBM. diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 2fda87a4ff0f6..e804f03e014e1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index a664864ff898f..23f65b99c22ce 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d89b822dd8739..7172394e42005 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index f62c7e1d2ee16..7770ec711ce78 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 3182a75325787..6951630c6f231 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 143b9f98b0293..a2c65f4b5edb4 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 81dc38988c9d9..76d7ecdd1272b 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5c30e36c7ce3a..a4f8a361ec710 100644 --- 
a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index dbe2be8a73d59..393ce41a91a00 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 8a4c2850dda3a..823197fc93503 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index f5d242fdf1c26..e828ce9c98499 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fcef457a78291..57d1b7c53ff60 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 76008b72941da..8fa8b89798d00 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 4803da2956ef1..3630f59f53e0a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Implementation of SiglipVisionModel intended to be only used within a vision language model.""" diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index eefadda918f62..08c47facad974 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -1,4 +1,5 @@ 
# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 31dec55026bae..0f22ba5b406ce 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index fcd17cc1c2ba4..8dd52f1d204a5 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 86ce813ddf3dd..d6ec743ce845e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index f4ba5a8030e52..9d9a2bff0e43f 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 7d713d23c772d..f0b31b1332fb1 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py index e05f23f99e979..3666f7011a997 100644 --- a/vllm/model_executor/models/teleflm.py +++ b/vllm/model_executor/models/teleflm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index b87a2ebf211ac..2f78d9d4cc065 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. 
# diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c1a4dc1b33d78..43836f2956c3b 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 3d821d3dc6b58..aa88f42101605 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable, Mapping diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 901d83ec5b9e6..ac6a659bbaa32 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Final, Generic, Optional, Protocol, TypeVar, Union diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index c6e303d6024a4..3ee5f7dba01f0 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 48e254bdd85bd..a4f97c774f706 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch Zamba2 model implementation for vLLM. This module implements the Zamba2 architecture from diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 34a0b527b585e..750ee78502688 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from fractions import Fraction from typing import Callable, Optional, Union diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 4c5db7396c03c..4dd443bc26ea0 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 6b83a59b59886..56f0f0984bfa0 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from dataclasses import dataclass diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 27cea65217875..cbaa34bfc30b2 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for model executor.""" import copy from typing import Any, Optional diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 815e34d5ac5db..2ef9f1ccc02be 100644 --- a/vllm/multimodal/__init__.py +++ 
b/vllm/multimodal/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .base import MultiModalPlaceholderMap from .hasher import MultiModalHashDict, MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 1fd2ab7f87d1f..fbb29276f6bdf 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from io import BytesIO from pathlib import Path diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 184c801e64d86..7188ed14c5735 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Sequence diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index b4cd6a90834c0..b7988359737ac 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle from collections.abc import Iterable, Mapping diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index a63ec0bd8ada4..e673632d43664 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from io import BytesIO diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 600a34d39ef68..35d2a6e8c74ff 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import UserDict, defaultdict diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 63af842747a54..cae62b2235e40 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import UserDict diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index aa7914e40cbff..5cfca57bffeec 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import sys from abc import ABC, abstractmethod diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 53f5b243d4967..1faecb7bd24a8 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b9f5cee922a70..27aaa661c35c8 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1d838f66f1dec..2b34cdf40b34f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project from itertools import groupby from pathlib import Path diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 261d56abad9c6..bedb9536e3c9c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from abc import abstractmethod diff --git a/vllm/outputs.py b/vllm/outputs.py index 3960388bf73c6..891305eb7936e 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections.abc import MutableSequence diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 00d00d05f47ae..13453d2c4b4b2 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import traceback diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index eaffaac78cce9..2739f5c8c6900 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9f833cbb587d8..e2d9424dee280 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Code inside this file can safely assume cuda platform, e.g. importing pynvml. However, it should not initialize cuda context. 
""" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index a8dd7df9f2e3e..3cf28950190c8 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c7a6272623576..1ec9c78a361af 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os import platform diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 56f204e71da17..04e918d7aebee 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os from functools import lru_cache diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ef1c632a53989..a929366db49cc 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from datetime import timedelta diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 0173b15697cfe..07e52017f5a53 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union, cast diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index b2a6ad5d77db6..73f6f3d417671 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import 
TYPE_CHECKING, Optional diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4cd3552f8a552..2cb177b9ba789 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import os diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py index 219231f777852..b999d07a6eb74 100644 --- a/vllm/plugins/lora_resolvers/filesystem_resolver.py +++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os from typing import Optional diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 9a3b254f9b68c..322f9ed3efa9f 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 6934d328a87ef..2f9ebe531cbb1 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from collections import defaultdict diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index b26fd4dd8c071..9f0f56a15fd53 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Callable, Union diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index c2f9f16919b7f..b5b925d042f23 100644 --- a/vllm/prompt_adapter/layers.py +++ 
b/vllm/prompt_adapter/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 795591606f259..864b50c861e19 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import math diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index dfb8e61d786a0..3ce50d0a26bb0 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import msgspec diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index dd179ab938f83..ddd007868f6bf 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index 28dcc16871120..56265de8087c0 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging from typing import Any, Optional, Set, Type diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 65606ce55af72..e8cd565519f36 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 9dd5191da9184..e827d381ca1d2 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 1c283c092a28c..1a5ca46a60f1d 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 07a63e294df49..5820001b918f6 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py index 7095034b1ca17..61bafc724c17f 100644 --- a/vllm/reasoning/qwen3_reasoning_parser.py +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4294465f68fcf..7abdcecca4746 100644 --- a/vllm/sampling_params.py 
+++ b/vllm/sampling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy from dataclasses import dataclass diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index fc1761c84cd11..9060b55c79b01 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import struct diff --git a/vllm/scripts.py b/vllm/scripts.py index 7e569d2d24fd6..7a7fdccf0a32b 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.entrypoints.cli.main import main as vllm_main from vllm.logger import init_logger diff --git a/vllm/sequence.py b/vllm/sequence.py index d359f897da25e..ffe890eb2dab4 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sequence and its related classes.""" import copy import enum diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e08ed742a5225..f9b882469a4df 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from itertools import chain, count diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 991d2040a878a..8ccfefea1acbd 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, 
Optional diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index dd085ad776384..70ec1590e7ad0 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 0b62a988e8b26..82b5a79fa7cb9 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 4430da26c0493..a4784cad962d0 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import Callable, Optional, Union diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index bdaf31895e25d..8e8c05d26361b 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 6275c460ecefa..18e7b055a6782 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.sequence import (ExecuteModelRequest, SequenceData, SequenceGroupMetadata, get_all_seq_ids) diff --git 
a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index de57403d1b50e..4a9bbe44d89a0 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import weakref diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 57ae173af6744..7a1a0e56dc00b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 2829d631b49ee..fb44275aa9357 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index ea3d91d7893bb..91256cab6e799 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 252c80957305b..7dda1cbfe2302 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from collections import 
defaultdict diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 08e773c562bf8..ca89eb60ac583 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index b538923c03e74..afd91b42b9433 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 466269b2107f5..22d2a4833acf9 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from contextlib import contextmanager diff --git a/vllm/test_utils.py b/vllm/test_utils.py index f8cec380f336e..c6b126d002b2d 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project MODELS_ON_S3 = [ "adept/fuyu-8b", "ai21labs/AI21-Jamba-1.5-Mini", diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py index 7ed9ced0e2620..d215e5d8bf657 100644 --- a/vllm/third_party/pynvml.py +++ b/vllm/third_party/pynvml.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # copied from https://pypi.org/project/nvidia-ml-py # version 12.570.86 diff --git a/vllm/tracing.py b/vllm/tracing.py index 557ae40b87aee..6a287d82be5ff 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Mapping diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index 84bd7a7476564..6d4231baca50b 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import envs diff --git a/vllm/transformers_utils/chat_templates/__init__.py b/vllm/transformers_utils/chat_templates/__init__.py index fe2bd3ca41253..2783d12a22147 100644 --- a/vllm/transformers_utils/chat_templates/__init__.py +++ b/vllm/transformers_utils/chat_templates/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .registry import get_chat_template_fallback_path __all__ = ["get_chat_template_fallback_path"] diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index 853fed5d4409d..e0ef7f0999d47 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path from typing import Callable, Optional, Union diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8774f95a2f60b..9bc3b8e09ada7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import json diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ed10c22c84f08..7edff455f2992 100644 --- 
a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 2261f0a9e9aac..a789b93b5edff 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 43e9503ffe03f..7c5de3e948ed7 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/ChatGLM2-6B diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index 21328d7675b82..e547a9c281cff 100644 --- a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index bffa127fecb25..7dbda99f85a4e 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py 
b/vllm/transformers_utils/configs/deepseek_vl2.py index a54486fa41cd1..957d638318410 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index a43e4746cb6c6..fb2e8a1df7052 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 25bafbb85d306..7450904a15caf 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py index f161a06f34238..2f5400463d91a 100644 --- a/vllm/transformers_utils/configs/falcon.py +++ b/vllm/transformers_utils/configs/falcon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py index 48b5d79ff950b..b36a6dd59d3d3 100644 --- a/vllm/transformers_utils/configs/h2ovl.py +++ 
b/vllm/transformers_utils/configs/h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py index 8ea62546e2133..4494ebfef667f 100644 --- a/vllm/transformers_utils/configs/internvl.py +++ b/vllm/transformers_utils/configs/internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b947c6a9e2b4b..767c4ddae870d 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py index 97ff44bb9c1c9..ae8dac0f381d6 100644 --- a/vllm/transformers_utils/configs/kimi_vl.py +++ b/vllm/transformers_utils/configs/kimi_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py index 885713c5d6cd0..9ba52956a8e8e 100644 --- a/vllm/transformers_utils/configs/medusa.py +++ b/vllm/transformers_utils/configs/medusa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py index 660e870ac62d8..e3b63dfa00371 100644 --- a/vllm/transformers_utils/configs/minimax_text_01.py +++ b/vllm/transformers_utils/configs/minimax_text_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ MiniMaxText01 model configuration""" from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py index 99e0d249dc5a7..c62497192cc2a 100644 --- a/vllm/transformers_utils/configs/minimax_vl_01.py +++ b/vllm/transformers_utils/configs/minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """MiniMaxVL01 model configuration""" from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py 
index eb77e09adca48..f0cd2d52a529e 100644 --- a/vllm/transformers_utils/configs/mllama.py +++ b/vllm/transformers_utils/configs/mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from transformers.models.mllama import configuration_mllama as mllama_hf_config diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index 70f60752905cb..2fa284e5c9e8f 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py index a2b4059a63efb..a6f712f3d6005 100644 --- a/vllm/transformers_utils/configs/moonvit.py +++ b/vllm/transformers_utils/configs/moonvit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 2d52658d3973c..91316408dcd89 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index fdf4fa2a53e57..d65b572dc7f22 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py index 300f6e21168e5..a533720af6c66 100644 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index 0ec224214f067..c2728f0ed64c9 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py index ef5f9ba85c237..33a45220e3159 100644 --- a/vllm/transformers_utils/configs/skyworkr1v.py +++ b/vllm/transformers_utils/configs/skyworkr1v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 6eaf699d17bee..a83dfa40b43a5 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py index 5da6c5b4427ea..050a7851d143f 100644 --- a/vllm/transformers_utils/configs/telechat2.py +++ b/vllm/transformers_utils/configs/telechat2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py """ Telechat configuration compatible with LlamaConfig. """ diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 4c50724272634..62f63b02d49a4 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py from typing import Any, Optional diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 3adf2e32cca7c..380c62a141f0f 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 7373fa0ede237..342632989d579 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index ce6427de432da..70cd08263d372 100644 --- 
a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 2bd9ab1f099b3..14d15f2bc1673 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index df960e9c7aa8f..b4669d12fa213 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index f1c6407e1f3a3..4fe76d0df622b 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 1c3520bcfb278..f95aae7815e0b 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch import os diff --git 
a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index fa7a208c48ed7..ae96ebe4eaa26 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import copy diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index d69e5a6b42513..20e5fea714e70 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib from abc import ABC, abstractmethod diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py index 8b9e4881ef88f..eb53cceaa0585 100644 --- a/vllm/transformers_utils/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 7aac29a6bf967..941156c4bf50e 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, truncate_tool_call_ids, validate_request_params) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 23b6f67f09df7..fcc0f538ff012 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project import os from dataclasses import dataclass diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 8dff1b612fdbb..66c8fb797adcd 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from functools import cache diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 9f14a907af3a5..0fcf5d15afd1d 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, TritonPlaceholder) diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 8cf2e01a33bd6..068fa303137c1 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import types from importlib.util import find_spec diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 90af0c63cc02b..c149637635b77 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/vllm/utils.py b/vllm/utils.py index b4152e6b24700..41336b80e3a25 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9ed3dec7f2695..9e989df1cd892 100755 --- 
a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1c4f7f62fa675..8bd998eba7695 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashInfer.""" from __future__ import annotations diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 1edfab26b6c12..96befca5a1e94 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ # MLA Common Components diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index e6594c6b6fa8c..060a7c9d8c853 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index d1e823bbe3965..8925b5a5cd7d0 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import 
dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 2e6b619db6287..0857fc133c431 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 8187e457d9e61..896f1394cfa4b 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index a97bb85004f6f..6a3314dd87889 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with PagedAttention and Triton prefix prefill.""" from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 10a771e830b68..2e65619ed7bc8 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import torch diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index a0a065df9b1ca..27eaca49797d8 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project from collections import defaultdict from collections.abc import Iterable from typing import Callable, Optional diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 05d70bb9b9773..16dc67b9b6f6a 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 59e07382b652f..91999d30035b9 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3ccad97e9919b..61476362e3024 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" import os from collections import deque diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index 055ce446051ef..dd5052a3480b7 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Iterable from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 2572344309837..b404c70eb1e44 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ce16a1ed5a096..e510a0626c1b4 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 3a0028a59016e..1397c5f4c9a6d 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.v1.request import Request, RequestStatus diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index e69e9ac9f6a37..233c73e882398 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import defaultdict from typing import Callable diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0c9f61a764279..d1bec25237d62 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import time diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4b235c596ed6d..0e369632156bd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import AsyncGenerator, Mapping from copy import copy diff --git 
a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index b84d4b144b5f2..4f6ba099c650c 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import time import weakref diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7253d1dc66d1f..f36a491a19702 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import queue import signal diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index fa01998aa9fe2..adb0709c828a7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import contextlib import queue diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index dca327cc5d07b..c6fe2d339c93d 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 97dd31d5e5218..692ba9dc840f8 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project class EngineGenerateError(Exception): """Raised when a AsyncLLM.generate() fails. 
Recoverable.""" pass diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c856e2645a2c9..736ffd8b40f00 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from copy import copy diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 03d82b6bbc1d6..edc3be5b0120e 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 45fb5cd23f60f..abe98a13dfd3e 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 293c291b43410..1dcfbab30cfb3 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import Iterable diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 4df7ca59731ec..1e9911152c6df 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import copy from typing import Optional diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 
64a756148780d..5c0d01d9b6f61 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections.abc import Mapping, Sequence diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 3b9feb0d32980..50b9634a49e1b 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future from typing import Callable, Union diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index eb5f9d4bfe004..0bd7383b5f0e4 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import os import pickle diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 320ebfd37ae37..257564793cf4e 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future from typing import Union diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 2747fc7fabd1e..cf2eb3b955691 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from dataclasses import dataclass diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 665e5873d5891..2d621ec31038f 100644 --- 
a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import time diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index a364b286d21b9..61ba5d66cb31a 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import tempfile diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index a51c3ed7f5720..18c8dcf0a0d35 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import Optional, Union diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py index 5ab78129a0094..4d6e599841541 100644 --- a/vllm/v1/metrics/reader.py +++ b/vllm/v1/metrics/reader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 8fe1630616a47..50c8b07fe54d2 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from dataclasses import dataclass, field diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e8ce0df5ed8d2..17a299d57cbaa 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import NamedTuple, Optional diff --git a/vllm/v1/request.py 
b/vllm/v1/request.py index 42c75ef964016..53fd70fabecf3 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index e97e1235fb365..ab13b288a5a9b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py index 2984d4e4806fe..1b699565f26f2 100644 --- a/vllm/v1/sample/ops/bad_words.py +++ b/vllm/v1/sample/ops/bad_words.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index ed05e3f48401a..48423b9b424dd 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 4a5fbb10d408b..30396f1594337 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 17b870fede8e7..b2354c53302ad 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 16561d30a6dc3..8ba3c2087a5cb 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" import torch diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py index a1c7dcdb111f5..4c1ac4895197c 100644 --- a/vllm/v1/sample/tpu/metadata.py +++ b/vllm/v1/sample/tpu/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Optional diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 7c31a2984b307..1056eb1d7b7fe 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampler layer implementing TPU supported operations.""" import torch diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 78f37c1e8b218..ab6653a786ffe 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import pickle diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1ca8564231659..416bc8af18ab5 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn as nn diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py 
index fdac2ef64c3f7..f516bf486b8b5 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn as nn diff --git a/vllm/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py index 1cf650d5fa569..b1efb40612d54 100644 --- a/vllm/v1/spec_decode/metadata.py +++ b/vllm/v1/spec_decode/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import numpy as np diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 36091bef28959..b4bc3058c570a 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Optional diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 704153d43a2b4..6b90d0970bd77 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import numpy as np diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index 334258e7f87ae..5c37333cebc7a 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.triton_utils import tl, triton from vllm.v1.worker.gpu_input_batch import InputBatch diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 07b422814e13a..b2b0ee7969543 100644 --- 
a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import multiprocessing diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 55c5f609095d7..02e7fc33f517d 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 09f6cdf733372..d500783aa4b30 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index f2570221da252..88544565e5443 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 9a7e30d41aaa8..fc365f12573fc 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import dataclasses diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 
111e92dc0990d..7adee7237bd12 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index d347efc425ef4..5b497e66c4bf3 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import multiprocessing diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 576086ebeb7f7..958262c492462 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import torch diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index b3e65917d3cc2..bb986b6047f65 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Datastructures defining an input batch from dataclasses import dataclass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9f7c474c71cbc..c96ad0c015301 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import gc diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index dd06e729673ff..f36cf5d5c3191 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU 
worker class.""" import gc import os diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index eb8ed622161d5..afa41a37eeb34 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Define LoRA functionality mixin for model runners. """ diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index c5171b9736b36..48ea3cb7bff0d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import bisect import gc import time diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index bf0a5777cb3ff..8d2f8112d2d7e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A TPU worker class.""" import os from typing import Optional diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 91548a52cfc70..b23b28c1d7e9c 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 487a49b6211e2..9c93754f93f81 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/version.py b/vllm/version.py index 8329d7becb683..6c88b1b5a3bf4 100644 --- a/vllm/version.py +++ 
b/vllm/version.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project try: from ._version import __version__, __version_tuple__ diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d48a6957c5dda..530907012f704 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """CacheEngine class for managing the KV cache.""" from typing import List diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 82eeeb570d222..677d66357a7fa 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index fb436a079f878..6213cf760ac55 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import weakref diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 2a60e51261ad6..174f86f48b568 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1436a404335a0..b04a9a1eb08d1 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A CPU worker class.""" import os from typing import Dict, List, Optional, Set, Tuple, Type diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 3957e5608524f..a3e7b0147961c 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import itertools diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e2261cbb26b44..17123d2b48375 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 533fead0e669e..6d76ea499a901 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8c968faa78101..75501e0f748ab 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import gc diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 935325cb2e1c0..d567ce4a6e78f 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from abc import ABC, abstractmethod diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py index 2c5e2eac75898..f0210c13c7553 100644 --- a/vllm/worker/multi_step_hpu_worker.py +++ b/vllm/worker/multi_step_hpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2025 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index f8d5acf586c51..cc0cc855e7be4 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import functools diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index aafb7ab7cfb8d..336e41649df58 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from importlib.util import find_spec from typing import List, Optional diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index 3a9c0993e004f..de9827723eecf 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional import torch diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index 3871199987cee..ed9f001666159 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Dict, Optional, Tuple diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 3518ab2f64fed..ea16e14f9ecd4 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from dataclasses import dataclass diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 3aff3e01aef16..28855bb4698bc 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from dataclasses import dataclass diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 64daee31bbdf5..662bde6bc07b0 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A Neuron worker class.""" import os from typing import List, Optional, Set, Tuple diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py index 9cd4f88d32f06..2a0f4e77c99e5 100644 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ b/vllm/worker/neuronx_distributed_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 912e04c435f54..be6b3d1379fdc 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e0cca90727458..5f1535271b9ac 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import time diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 4bb9bea022f99..ad5ed19e2f894 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import List, Optional, Tuple, Union diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index e2854bcb37cef..1a5f62cb3c471 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ''' Worker-related helper functions. ''' diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2a43172719342..9a928632688a1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" import gc import os diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e5662e69343c6..db1ca2d8ff30a 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import os diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 79fa7d2c73e88..ecbb63d912766 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import time diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index a5109a982cbfe..fe321c059f526 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A XPU worker class.""" import gc import os From 19bdaf32b139656627c8b311361a0fa38ae98f4b Mon Sep 17 00:00:00 2001 From: SorenDreano <71752785+SorenDreano@users.noreply.github.com> Date: Tue, 3 Jun 2025 20:50:55 +0200 Subject: [PATCH 026/115] [Doc] Readme standardization (#18695) Co-authored-by: Soren Dreano --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 67f6b957ec55a..ec16d758327d4 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Speculative decoding - Chunked prefill @@ -72,14 +72,14 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. 
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) -- Embedding Models (e.g. E5-Mistral) +- Embedding Models (e.g., E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) From 01eee4053606458b2596818acd1fffee699ed75d Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:08:21 +0800 Subject: [PATCH 027/115] [doc] update docker version (#19074) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/deployment/docker.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 9e506d3d7ba38..93d9e80f5b012 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -46,11 +46,11 @@ You can add any other [engine-args][engine-args] you need after the image tag (` create a custom Dockerfile on top of the base image with an extra layer that installs them: ```Dockerfile - FROM vllm/vllm-openai:v0.8.3 + FROM vllm/vllm-openai:v0.9.0 # e.g. install the `audio` optional dependencies # NOTE: Make sure the version of vLLM matches the base image! - RUN uv pip install --system vllm[audio]==0.8.3 + RUN uv pip install --system vllm[audio]==0.9.0 ``` !!! 
tip From fa98d77773c649de05a4bda9847682c80287aa36 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 3 Jun 2025 15:30:02 -0400 Subject: [PATCH 028/115] [Kernel] DeepEP dispatch-combine kernel integration (#18434) Signed-off-by: Varun Co-authored-by: Varun Sundar Rabindranath --- csrc/moe/topk_softmax_kernels.cu | 16 +- tests/kernels/moe/__init__.py | 0 tests/kernels/moe/deepep_utils.py | 188 +++++++ tests/kernels/moe/test_deepep_deepgemm_moe.py | 371 ++++++++++++++ tests/kernels/moe/test_deepep_moe.py | 459 ++++++++++++++++++ vllm/config.py | 2 + .../device_communicators/all2all.py | 146 +++++- .../device_communicators/cuda_communicator.py | 8 + vllm/envs.py | 2 + .../layers/fused_moe/deep_gemm_moe.py | 32 +- .../fused_moe/deepep_ht_prepare_finalize.py | 236 +++++++++ .../fused_moe/deepep_ll_prepare_finalize.py | 152 ++++++ .../layers/fused_moe/fused_batched_moe.py | 57 ++- .../layers/fused_moe/fused_moe.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 148 ++++-- .../layers/fused_moe/modular_kernel.py | 158 ++++-- .../layers/fused_moe/moe_permute_unpermute.py | 5 +- .../layers/fused_moe/pplx_prepare_finalize.py | 11 +- .../layers/fused_moe/prepare_finalize.py | 12 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 7 +- vllm/model_executor/layers/fused_moe/utils.py | 4 +- .../model_executor/layers/quantization/fp8.py | 41 +- vllm/platforms/cuda.py | 15 + 23 files changed, 1950 insertions(+), 122 deletions(-) create mode 100644 tests/kernels/moe/__init__.py create mode 100644 tests/kernels/moe/deepep_utils.py create mode 100644 tests/kernels/moe/test_deepep_deepgemm_moe.py create mode 100644 tests/kernels/moe/test_deepep_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index a9379032245d9..10be47966f611 100644 --- 
a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -516,9 +516,8 @@ void topk_softmax( topk, stream); } - else + else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { - assert(topk_indices.scalar_type() == at::ScalarType::UInt32); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr(), topk_weights.data_ptr(), @@ -530,4 +529,17 @@ void topk_softmax( topk, stream); } + else { + assert(topk_indices.scalar_type() == at::ScalarType::Int64); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/tests/kernels/moe/__init__.py b/tests/kernels/moe/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py new file mode 100644 index 0000000000000..2bc9b657da859 --- /dev/null +++ b/tests/kernels/moe/deepep_utils.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +DeepEP test utilities +""" +import dataclasses +import importlib +import traceback +from typing import Callable, Optional + +import torch +from torch.distributed import ProcessGroup +from torch.multiprocessing import ( + spawn) # pyright: ignore[reportPrivateImportUsage] +from typing_extensions import Concatenate, ParamSpec + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + DeepEPLLPrepareAndFinalize) + +## Parallel Processes Utils + +P = ParamSpec("P") + + +@dataclasses.dataclass +class ProcessGroupInfo: + world_size: int + world_local_size: int + rank: int + node_rank: int + local_rank: int + device: torch.device 
+ + +def _worker_parallel_launch( + local_rank: int, + world_size: int, + world_local_size: int, + node_rank: int, + init_method: str, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + rank = node_rank * world_local_size + local_rank + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + torch.distributed.init_process_group( + backend="cpu:gloo,cuda:nccl", + init_method=init_method, + rank=rank, + world_size=world_size, + device_id=device, + ) + barrier = torch.tensor([rank], device=device) + torch.distributed.all_reduce(barrier) + + try: + worker( + ProcessGroupInfo( + world_size=world_size, + world_local_size=world_local_size, + rank=rank, + node_rank=node_rank, + local_rank=local_rank, + device=device, + ), + *args, + **kwargs, + ) + except Exception as ex: + print(ex) + traceback.print_exc() + raise + finally: + torch.distributed.destroy_process_group() + + +def parallel_launch( + world_size: int, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + assert not kwargs + spawn( + _worker_parallel_launch, + args=( + world_size, + world_size, + 0, + "tcp://localhost:29500", + worker, + ) + args, + nprocs=world_size, + join=True, + ) + + +## DeepEP specific utils + + +@dataclasses.dataclass +class DeepEPHTArgs: + num_local_experts: int + + +@dataclasses.dataclass +class DeepEPLLArgs: + max_tokens_per_rank: int + hidden_size: int + num_experts: int + use_fp8_dispatch: bool + + +def make_deepep_ht_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + ht_args: DeepEPHTArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + + import deep_ep + + # high throughput a2a + num_nvl_bytes = 1024 * 1024 * 1024 # 1GB + num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1 + buffer = deep_ep.Buffer(group=pg, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + 
low_latency_mode=low_latency_mode, + num_qps_per_rank=num_qps_per_rank) + return DeepEPHTPrepareAndFinalize(buffer=buffer, + world_size=pgi.world_size, + rank=pgi.rank, + dp_size=dp_size, + rank_expert_offset=pgi.rank * + ht_args.num_local_experts, + quant_dtype=q_dtype, + block_shape=block_shape) + + +def make_deepep_ll_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + deepep_ll_args: DeepEPLLArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + + import deep_ep + + # low-latency a2a + num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( + deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size, + pgi.world_size, deepep_ll_args.num_experts) + + buffer = deep_ep.Buffer(group=pg, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=deepep_ll_args.num_experts // + pgi.world_size) + return DeepEPLLPrepareAndFinalize( + buffer=buffer, + world_size=pgi.world_size, + dp_size=dp_size, + max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank, + quant_dtype=q_dtype, + use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch, + ) + + +def make_deepep_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + deepep_ht_args: Optional[DeepEPHTArgs], + deepep_ll_args: Optional[DeepEPLLArgs], + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + if deepep_ht_args is not None: + assert deepep_ll_args is None + return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype, + block_shape) + + assert deepep_ll_args is not None + return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py new file mode 100644 index 0000000000000..a1fdc1d5ff47b --- /dev/null +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -0,0 +1,371 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Test DeepEP + DeepGEMM integration +""" + +import dataclasses +import 
importlib +from typing import Optional + +import pytest +import torch.distributed +from torch.distributed import ProcessGroup +from typing_extensions import ParamSpec + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.platforms import current_platform + +from .deepep_utils import ProcessGroupInfo, parallel_launch + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None + +try: + import deep_gemm + has_deep_gemm = True +except ImportError: + has_deep_gemm = False + +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + + from .deepep_utils import DeepEPHTArgs, make_deepep_a2a + +if has_deep_gemm: + from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + DeepGemmExperts) + +requires_deep_ep = pytest.mark.skipif( + not has_deep_ep, + reason="Requires deep_ep kernels", +) + +requires_deep_gemm = pytest.mark.skipif( + not has_deep_gemm, + reason="Requires deep_gemm kernels", +) + +P = ParamSpec("P") + + +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros( + (deep_gemm.ceil_div(m, 128) * 128, + deep_gemm.ceil_div(n, block_size_n) * block_size_n), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return 
x_scaled_sub, scales + + +def make_block_quant_fp8_weights( + e: int, + n: int, + k: int, + block_size: list[int], +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Return weights w1, w2, w1q, w2q, w1_scale, w2_scale + """ + dtype = torch.bfloat16 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + w1_bf16 = torch.randn((e, 2 * n, k), dtype=dtype) / 10 + w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) + + w2_bf16 = torch.randn((e, k, n), dtype=dtype) / 10 + w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = ((2 * n) + block_n - 1) // block_n + k_tiles_w1 = (k + block_k - 1) // block_k + n_tiles_w2 = (k + block_n - 1) // block_n + k_tiles_w2 = (n + block_k - 1) // block_k + + w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn) + w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn) + + w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), + device="cuda", + dtype=torch.float32) + w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), + device="cuda", + dtype=torch.float32) + + assert w1_s.shape == (e, (2 * n + 127) // 128, (k + 127) // 128) + assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] + + for i in range(e): + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + + return w1, w2, w1_s, w2_s + + +@dataclasses.dataclass +class TestConfig: + topk: int + m: int + k: int + n: int + num_experts: int + block_size: list[int] + + +@dataclasses.dataclass +class TestTensors: + rank_tokens: torch.Tensor # all ranks make this many tokens + rank_token_scales: Optional[torch.Tensor] + topk: torch.Tensor + topk_weights: torch.Tensor + config: TestConfig + + @staticmethod + def make(config: TestConfig, rank) -> "TestTensors": + + dtype = torch.bfloat16 + topk, m, k, block_size = (config.topk, config.m, config.k, + config.block_size) + + 
fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + rank_tokens = torch.randn( + (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 + rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max) + + block_k = block_size[1] + _, rank_token_scales = per_token_group_quant_fp8(rank_tokens, block_k) + + topk_ids = torch.randint( + low=0, + high=config.num_experts, + size=(m, topk), + device=torch.cuda.current_device()).to(dtype=torch.int64) + + topk_weights = torch.randn(topk_ids.shape, + dtype=torch.float32, + device=torch.cuda.current_device()) + + return TestTensors(rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk_ids, + topk_weights=topk_weights, + config=config) + + +def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + num_local_experts: int, q_dtype: Optional[torch.dtype], + block_shape: list[int]) -> FusedMoEModularKernel: + + a2a: DeepEPHTPrepareAndFinalize = make_deepep_a2a( + pg=pg, + pgi=pgi, + dp_size=dp_size, + deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts), + deepep_ll_args=None, + q_dtype=q_dtype, + block_shape=block_shape) + + fused_experts = DeepGemmExperts() + mk = FusedMoEModularKernel(prepare_finalize=a2a, + fused_experts=fused_experts) + return mk + + +def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + num_experts: int) -> torch.Tensor: + + num_local_experts = w1.size(0) + + def build_expert_map(): + num_local_experts = w1.size(0) + expert_map = torch.full((num_experts, ), + fill_value=-1, + dtype=torch.int32) + s = pgi.rank * num_local_experts + e = s + num_local_experts + expert_map[s:e] = torch.tensor(list(range(num_local_experts))) + return expert_map.to(device=torch.cuda.current_device(), + dtype=torch.int32) + + q_dtype = torch.float8_e4m3fn + + # Make 
modular kernel + mk: FusedMoEModularKernel = make_modular_kernel( + pg, pgi, dp_size, num_local_experts, q_dtype, + test_tensors.config.block_size) + + a1_scale = test_tensors.rank_token_scales + + out = mk.forward(hidden_states=test_tensors.rank_tokens, + w1=w1, + w2=w2, + topk_weights=test_tensors.topk_weights, + topk_ids=test_tensors.topk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=a1_scale, + a2_scale=None, + apply_router_weight_on_input=False) + return out + + +def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, + topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + a1_scale: torch.Tensor, block_shape: list[int]): + + return fused_experts( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + block_shape=block_shape, + # Make sure this is set to False so we + # dont end up comparing the same implementation. 
+ allow_deep_gemm=False) + + +def _deep_ep_moe( + pgi: ProcessGroupInfo, + dp_size: int, + config: TestConfig, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, +): + current_platform.seed_everything(pgi.rank) + + w1 = w1.to(device=torch.cuda.current_device()) + w2 = w2.to(device=torch.cuda.current_device()) + w1_scale = w1_scale.to(device=torch.cuda.current_device()) + w2_scale = w2_scale.to(device=torch.cuda.current_device()) + + pg = torch.distributed.new_group(list(range(pgi.world_size))) + test_tensors = TestTensors.make(config, pgi.rank) + block_shape = [ + w1.size(1) // w1_scale.size(1), + w1.size(2) // w1_scale.size(2) + ] + + with set_current_vllm_config(VllmConfig()): + # Reference + triton_moe = triton_impl(a=test_tensors.rank_tokens, + topk_ids=test_tensors.topk, + topk_weights=test_tensors.topk_weights, + w1=w1, + w2=w2, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=test_tensors.rank_token_scales, + block_shape=block_shape) + + # Slice experts for this rank. 
+ num_local_experts = config.num_experts // pgi.world_size + e_start = num_local_experts * pgi.rank + e_end = e_start + num_local_experts + w1_ep = w1[e_start:e_end] + w2_ep = w2[e_start:e_end] + w1_scale_ep = w1_scale[e_start:e_end] + w2_scale_ep = w2_scale[e_start:e_end] + + deepep_moe = deep_ep_moe_impl( + pg, + pgi, + dp_size, + test_tensors, + w1_ep, + w2_ep, + w1_scale_ep, + w2_scale_ep, + config.num_experts, + ) + + torch.testing.assert_close( + triton_moe, + deepep_moe, + atol=6e-2, + rtol=6e-2, + ) + + +MNKs = [ + (8, 128, 128), + (8, 128, 512), + (8, 512, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (129, 128, 256), + (129, 1024, 2048), + (222, 1024, 2048), +] + + +@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@requires_deep_ep +@requires_deep_gemm +def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, + world_dp_size: tuple[int, int]): + + m, n, k = mnk + current_platform.seed_everything(7) + + if topk > num_experts: + pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_size = [block_m, block_m] + + world_size, dp_size = world_dp_size + config = TestConfig( + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts, + block_size=block_size, + ) + + w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( + num_experts, n, k, block_size) + + parallel_launch(world_size, _deep_ep_moe, dp_size, config, w1, w2, + w1_scale, w2_scale) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py new file mode 100644 index 0000000000000..7e029ea950555 --- /dev/null +++ b/tests/kernels/moe/test_deepep_moe.py @@ -0,0 +1,459 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Test deepep dispatch-combine logic +""" + +import dataclasses +import importlib +from 
typing import Optional, Union + +import pytest +import torch.distributed +from torch.distributed import ProcessGroup + +from vllm import _custom_ops as ops +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import TritonExperts +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.platforms import current_platform + +from .deepep_utils import ProcessGroupInfo, parallel_launch + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None + +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + DeepEPLLPrepareAndFinalize) + + from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a + +requires_deep_ep = pytest.mark.skipif( + not has_deep_ep, + reason="Requires deep_ep kernels", +) + +MAX_TOKENS_PER_RANK = 64 + + +def make_weights( + e, n, k, dtype +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Return weights w1, w2, w1_scale, w2_scale + """ + if dtype in [torch.float16, torch.bfloat16]: + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + return w1, w2, None, None + + # per-out-channel weight quantization + assert dtype == torch.float8_e4m3fn + w1 = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float16) + w2 = torch.empty((e, k, n), device="cuda", dtype=torch.float16) + + n_b_scales = 2 * n + k_b_scales = k + w1_q = torch.empty_like(w1, dtype=dtype) + w2_q = torch.empty_like(w2, dtype=dtype) + w1_scale = 
torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + w1[expert], use_per_token_if_dynamic=True) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + w2[expert], use_per_token_if_dynamic=True) + return w1_q, w2_q, w1_scale, w2_scale + + +@dataclasses.dataclass +class TestConfig: + dtype: torch.dtype + topk: int + m: int + k: int + n: int + num_experts: int + + +@dataclasses.dataclass +class TestTensors: + rank_tokens: torch.Tensor # all ranks make this many tokens + rank_token_scales: Optional[torch.Tensor] + topk: torch.Tensor + topk_weights: torch.Tensor + config: TestConfig + + @staticmethod + def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors": + # TODO (varun) - check that float16 works ? + assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn] + token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn + else config.dtype) + rank_tokens = torch.randn( + (config.m, config.k), device="cuda", dtype=token_dtype) / 10 + rank_token_scales = None + if config.dtype == torch.float8_e4m3fn: + # low_latency_mode kernels dont support per-token quant. 
+ _, rank_token_scales = ops.scaled_fp8_quant( + rank_tokens, use_per_token_if_dynamic=not low_latency_mode) + + topk = torch.randint(low=0, + high=config.num_experts, + size=(config.m, config.topk), + device="cuda").to(dtype=torch.int64) + topk_weights = torch.randn(topk.shape, + dtype=torch.float32, + device="cuda") + return TestTensors(rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk, + topk_weights=topk_weights, + config=config) + + +def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, + low_latency_mode: bool, hidden_size: int, dp_size: int, + num_experts: int, num_local_experts: int, + q_dtype: Optional[torch.dtype], + use_fp8_dispatch: bool) -> FusedMoEModularKernel: + + is_quantized = q_dtype is not None + + ht_args: Optional[DeepEPHTArgs] = None + ll_args: Optional[DeepEPLLArgs] = None + + if low_latency_mode: + ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK, + hidden_size=hidden_size, + num_experts=num_experts, + use_fp8_dispatch=use_fp8_dispatch) + else: + assert not use_fp8_dispatch, ( + "FP8 Dispatch is valid only for low-latency kernels") + ht_args = DeepEPHTArgs(num_local_experts=num_local_experts) + + a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \ + make_deepep_a2a(pg = pg, + pgi = pgi, + dp_size = dp_size, + q_dtype = q_dtype, + block_shape = None, + deepep_ht_args = ht_args, + deepep_ll_args = ll_args) + + if low_latency_mode: + fused_experts = BatchedTritonExperts( + max_num_tokens=MAX_TOKENS_PER_RANK, + world_size=pgi.world_size, + dp_size=dp_size, + use_fp8_w8a8=is_quantized, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False) + else: + fused_experts = TritonExperts(use_fp8_w8a8=is_quantized, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=False) + + mk = FusedMoEModularKernel(prepare_finalize=a2a, + fused_experts=fused_experts) + return mk + + +def deep_ep_moe_impl(pg: ProcessGroup, pgi: 
ProcessGroupInfo, + low_latency_mode: bool, dp_size: int, + test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], num_experts: int, + use_fp8_dispatch: bool) -> torch.Tensor: + + num_local_experts = w1.size(0) + + def build_expert_map(): + num_local_experts = w1.size(0) + expert_map = torch.full((num_experts, ), + fill_value=-1, + dtype=torch.int32) + s = pgi.rank * num_local_experts + e = s + num_local_experts + expert_map[s:e] = torch.tensor(list(range(num_local_experts))) + return expert_map.to(device=torch.cuda.current_device(), + dtype=torch.int32) + + hidden_size = test_tensors.rank_tokens.size(1) + is_quantized = w1.dtype == torch.float8_e4m3fn + q_dtype = None + if is_quantized: + q_dtype = torch.float8_e4m3fn + + # Make modular kernel + mk: FusedMoEModularKernel = make_modular_kernel(pg, pgi, low_latency_mode, + hidden_size, dp_size, + num_experts, + num_local_experts, q_dtype, + use_fp8_dispatch) + + out_hidden_states = torch.empty_like(test_tensors.rank_tokens) + total_num_tokens = test_tensors.rank_tokens.size(0) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + rank_tokens_chunk = test_tensors.rank_tokens[chunk_start:chunk_end] + topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end] + topk_chunk = test_tensors.topk[chunk_start:chunk_end] + rank_token_scales_chunk = test_tensors.rank_token_scales + if rank_token_scales_chunk is not None and rank_token_scales_chunk.size( + 0) == total_num_tokens: + # per act token + rank_token_scales_chunk = rank_token_scales_chunk[ + chunk_start:chunk_end] + + out = mk.forward(hidden_states=rank_tokens_chunk, + w1=w1, + w2=w2, + topk_weights=topk_weights_chunk, + topk_ids=topk_chunk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=rank_token_scales_chunk, + 
a2_scale=None, + apply_router_weight_on_input=False) + + if not skip_result_store: + out_hidden_states[chunk_start:chunk_end, :].copy_( + out, non_blocking=True) + + max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK + if low_latency_mode else total_num_tokens) + + for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp): + chunk_start = chunk_start_ + chunk_end = min(chunk_start + max_num_tokens_per_dp, total_num_tokens) + # clamp start and end + chunk_start = min(chunk_start, total_num_tokens - 1) + chunk_end = min(chunk_end, total_num_tokens) + + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= total_num_tokens) + + return out_hidden_states + + +def torch_moe_impl(test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], using_fp8_dispatch: bool): + + a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk, + test_tensors.topk_weights) + if using_fp8_dispatch: + # The DeepEP implementation is requested to dispatch using FP8. + # For numerical stability for testing, emulate the fp8 dispatch by + # blockwise quant and de-quant. 
+ a = test_tensors.rank_tokens + aq, aq_scale = per_token_group_quant_fp8(a, 128) + a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view( + a.shape).to(a.dtype) + + is_quantized = w1.dtype == torch.float8_e4m3fn + a_dtype = a.dtype + if is_quantized: + w1 = w1.to(dtype=torch.float32) * w1_scale + w2 = w2.to(dtype=torch.float32) * w2_scale + a = a.to(dtype=torch.float32) + + m, _ = a.shape + topk = topk_ids.size(1) + out = torch.zeros_like(a) + + for i in range(m): + a_i = a[i] + o_i = out[i] + for j in range(topk): + e = topk_ids[i][j] + e_w = topk_weights[i][j] + w1_e = w1[e] + w2_e = w2[e] + o_i += (SiluAndMul() + (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w + + if is_quantized: + out = out.to(dtype=a_dtype) + + return out + + +def _deep_ep_moe( + pgi: ProcessGroupInfo, + low_latency_mode: bool, + dp_size: int, + config: TestConfig, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + use_fp8_dispatch: bool, +): + + if not low_latency_mode: + assert not use_fp8_dispatch, ( + "FP8 dispatch interface is available only in low-latency mode") + + is_quantized = w1.dtype == torch.float8_e4m3fn + w1 = w1.to(device=torch.cuda.current_device()) + w2 = w2.to(device=torch.cuda.current_device()) + if is_quantized: + w1_scale = w1_scale.to( # type: ignore + device=torch.cuda.current_device()) + w2_scale = w2_scale.to( # type: ignore + device=torch.cuda.current_device()) + + pg = torch.distributed.new_group(list(range(pgi.world_size))) + test_tensors = TestTensors.make(config, low_latency_mode) + + with set_current_vllm_config(VllmConfig()): + # Reference + torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale, + w2_scale, use_fp8_dispatch) + + # Splice experts for this rank. 
+ num_local_experts = config.num_experts // pgi.world_size + e_start = num_local_experts * pgi.rank + e_end = e_start + num_local_experts + w1_ep = w1[e_start:e_end] + w2_ep = w2[e_start:e_end] + + w1_scale_ep, w2_scale_ep = None, None + if is_quantized: + w1_scale_ep = w1_scale[e_start:e_end] # type: ignore + w2_scale_ep = w2_scale[e_start:e_end] # type: ignore + deepep_combined = deep_ep_moe_impl( + pg, + pgi, + low_latency_mode, + dp_size, + test_tensors, + w1_ep, + w2_ep, + w1_scale_ep, + w2_scale_ep, + config.num_experts, + use_fp8_dispatch, + ) + + torch.testing.assert_close( + torch_combined, + deepep_combined, + atol=6e-2, + rtol=6e-2, + ) + + +MNKs = [ + (1, 128, 128), + (2, 128, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (222, 1024, 2048), +] + +DTYPES = [torch.bfloat16, torch.float8_e4m3fn] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@requires_deep_ep +def test_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], + num_experts: int, topk: int, world_dp_size: tuple[int, + int]): + low_latency_mode = False + use_fp8_dispatch = False + m, n, k = mnk + + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + config = TestConfig(dtype=dtype, + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts) + + w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) + + parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, + config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch) + + +MNKs = [ + (1, 128, 2560), + (2, 128, 2560), + (3, 1024, 2560), + (32, 128, 2560), + (45, 512, 2560), + (64, 1024, 2560), + (222, 1024, 2560), +] +DTYPES = [torch.float8_e4m3fn, torch.bfloat16] +USE_FP8_DISPATCH = [True, False] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("mnk", MNKs) 
+@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@requires_deep_ep +def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], + num_experts: int, topk: int, + world_dp_size: tuple[int, int], + use_fp8_dispatch: bool): + + low_latency_mode = True + m, n, k = mnk + + if (low_latency_mode + and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES): + pytest.skip( + f"Skipping test as hidden size {k} is not in list of supported " + f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" + ) + + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + config = TestConfig(dtype=dtype, + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts) + + w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) + + parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, + config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch) diff --git a/vllm/config.py b/vllm/config.py index d99e501ca279a..f6ca9328b8a19 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1856,6 +1856,8 @@ class ParallelConfig: factors.append(self.pipeline_parallel_size) factors.append(self.tensor_parallel_size) factors.append(self.enable_expert_parallel) + factors.append(self.data_parallel_size) + factors.append(envs.VLLM_ALL2ALL_BACKEND) return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index ae75902994423..2ab3779ece056 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util -from typing import TYPE_CHECKING +from typing 
import TYPE_CHECKING, Any import torch import torch.distributed as dist @@ -129,3 +129,147 @@ class PPLXAll2AllManager(All2AllManagerBase): from pplx_kernels.nvshmem import nvshmem_finalize logger.debug("PPLX NVSHMEM finalize") nvshmem_finalize() + + +class DeepEPAll2AllManagerBase(All2AllManagerBase): + """ + All2All communication based on DeepEP High-Throughput kernels. + """ + + def __init__(self, cpu_group): + has_deepep = importlib.util.find_spec("deep_ep") is not None + assert has_deepep, "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels." # noqa + super().__init__(cpu_group) + self.handle_cache = Cache() + + # This is the DeepEP default. Stick to it till we can establish + # reasonable defaults based on profiling. + self.num_sms = 20 + + def get_handle(self, kwargs): + raise NotImplementedError + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + pass + + +class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): + """ + All2All communication based on DeepEP High-Throughput kernels. + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def _make_all2all_kwargs(self) -> dict[Any, Any]: + # Defaults for internode and intranode are taken from DeepEP tests. 
+ num_nvl_bytes = 1024 * 1024 * 1024 + num_rdma_bytes = None + num_qps_per_rank = None + + if self.internode: + num_rdma_bytes = 1024 * 1024 * 1024 + num_qps_per_rank = self.num_sms // 2 + else: + assert self.intranode + num_rdma_bytes = 0 + num_qps_per_rank = 1 + + assert num_rdma_bytes is not None + assert num_qps_per_rank is not None + return dict(group=self.cpu_group, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=False, + num_qps_per_rank=num_qps_per_rank) + + def get_handle(self, kwargs): + + assert len(kwargs) == 0, ( + "DeepEPHTAll2AllManager expects no arguments. All the required " + "args are computed in the Manager itself.") + + import deep_ep + buffer_kwargs = self._make_all2all_kwargs() + logger.debug("DeepEP all2all args %s", buffer_kwargs) + handle: deep_ep.Buffer = self.handle_cache.get_or_create( + buffer_kwargs, deep_ep.Buffer) + # It is dangerous to set num sms outside this function. num_sms is not + # a part of the hash-key that identifies this object. If we are in a + # situation where we make objects with different num_sms, the hash key + # in get_or_create must be updated. + handle.set_num_sms(self.num_sms) + return handle + + +class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): + """ + All2All communication based on DeepEP Low-Latency kernels. + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def _make_all2all_kwargs( + self, + max_num_tokens_per_dp_rank: int, + token_hidden_size: int, + num_ep_ranks: int, + num_global_experts: int, + num_local_experts: int, + ) -> dict[Any, Any]: + """ + max_num_tokens_per_dp_rank : the maximum number of tokens a DP rank + can dispatch all the ranks must hold the same value. + token_hidden_size: the hidden dimension of each token. + num_ep_ranks: the number of EP group ranks. + num_global_experts: Number of experts in the model. + num_local_experts: Number of experts in an EP rank. 
+ """ + import deep_ep + + # Defaults for internode and intranode are taken from DeepEP tests. + num_nvl_bytes = 1024 * 1024 * 1024 + num_qps_per_rank = num_local_experts + num_rdma_bytes = None + + if self.internode: + num_rdma_bytes = 1024 * 1024 * 1024 + else: + assert self.intranode + num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( + num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, + hidden=token_hidden_size, + num_ranks=num_ep_ranks, + num_experts=num_global_experts) + + assert num_rdma_bytes is not None + return dict(group=self.cpu_group, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=num_qps_per_rank) + + def get_handle(self, kwargs): + """ + The kwargs for DeepEPLLAll2AllManager is dictated by + _make_all2all_kwargs. + """ + import deep_ep + buffer_kwargs = self._make_all2all_kwargs(**kwargs) + logger.debug("DeepEP all2all args %s", buffer_kwargs) + handle: deep_ep.Buffer = self.handle_cache.get_or_create( + buffer_kwargs, deep_ep.Buffer) + # It is dangerous to set num sms outside this function. num_sms is not + # a part of the hash-key that identifies this object. If we are in a + # situation where we make objects with different num_sms, the hash key + # in get_or_create must be updated. 
+ handle.set_num_sms(self.num_sms) + return handle diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 0eebdf8736ce2..055d91690e676 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -67,6 +67,14 @@ class CudaCommunicator(DeviceCommunicatorBase): from .all2all import PPLXAll2AllManager self.all2all_manager = PPLXAll2AllManager(self.cpu_group) logger.info("Using PPLX all2all manager.") + elif all2all_backend == "deepep_high_throughput": + from .all2all import DeepEPHTAll2AllManager + self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group) + logger.info("Using DeepEP High-Throughput all2all manager.") + elif all2all_backend == "deepep_low_latency": + from .all2all import DeepEPLLAll2AllManager + self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group) + logger.info("Using DeepEP Low-Latency all2all manager.") else: raise ValueError(f"Unknown all2all backend: {all2all_backend}") diff --git a/vllm/envs.py b/vllm/envs.py index 2e3d6eeb57e8a..08bf2dad44554 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -826,6 +826,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Available options: # - "naive": naive all2all implementation using all-reduce # - "pplx": use pplx kernels + # - "deepep_high_throughput", use deepep high-throughput kernels + # - "deepep_low_latency", use deepep low-latency kernels "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 331544d64ff83..97b4a49c064eb 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -12,8 +12,8 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute) from 
vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, - _resize_cache) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, per_token_group_quant_fp8) from vllm.utils import round_up logger = init_logger(__name__) @@ -34,10 +34,8 @@ def _valid_deep_gemm_shape(M: int, N: int, K: int): return align <= M and N % align == 0 and K % align == 0 -def _valid_deep_gemm(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - expert_map: Optional[torch.Tensor] = None) -> bool: +def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, + w2: torch.Tensor) -> bool: """ Check if the given problem size is supported by the DeepGemm grouped gemm kernel. All of M, N, K and the quantization block_shape must be @@ -47,10 +45,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, logger.debug("DeepGemm disabled: deep_gemm not available.") return False - if expert_map is not None: - logger.debug("DeepGemm disabled: expert map NYI.") - return False - M = hidden_states.size(0) _, K, N = w2.size() if not _valid_deep_gemm_shape(M, N, K): @@ -116,7 +110,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): a1q = hidden_states _, N, K = w1.size() - assert global_num_experts != -1 + if global_num_experts == -1: + global_num_experts = w1.size(0) + assert w2.size(1) == K a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute( @@ -128,6 +124,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): self.block_shape[0], ) + if expert_map is not None: + # DeepGemm (Grouped Contiguous) kernel needs a valid B index + # for all rows of A. To that effect, simply compute with + # the 0th weight matrix. + # Note that this relies on the fact that corresponding topk + # weights would be 0 during weight multiplication. 
+ expert_ids = torch.where(expert_ids == -1, 0, expert_ids) + # Note: M_sum is different than the pre-permuted shape of a1q. M_sum = a1q.size(0) workspace1 = _resize_cache(workspace13, (M_sum, N)) @@ -140,9 +144,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): self.activation(activation, workspace2, workspace1.view(-1, N)) a2q_scale: Optional[torch.Tensor] = None - - a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False, - self.block_shape) + a2q, a2q_scale = per_token_group_quant_fp8(workspace2, + self.block_shape[1], + column_major_scales=True) dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py new file mode 100644 index 0000000000000..48cf01638ade4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import deep_ep +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + Prepare/Finalize using DeepEP High-Throughput kernels. + """ + + def __init__(self, + buffer: deep_ep.Buffer, + world_size: int, + rank: int, + dp_size: int, + rank_expert_offset: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + super().__init__() + self.buffer = buffer + self.world_size = world_size + self.rank = rank + self.dp_size = dp_size + self.rank_expert_offset = rank_expert_offset + self.quant_dtype = quant_dtype + self.block_shape = block_shape + # The dispatch function returns a handle that the combine function + # requires. We store the handle here so it is available to the + # combine function. 
+ self.handle = None + + # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164 + self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160] + + def max_num_tokens_per_rank(self) -> Optional[int]: + return None + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.int64 + + def _get_dispatch_config(self) -> Optional[deep_ep.Config]: + if self.dp_size not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_dispatch_config(self.dp_size) + + def _get_combine_config(self) -> Optional[deep_ep.Config]: + if self.dp_size not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_combine_config(self.dp_size) + + def _do_quant(self, tokens: torch.Tensor, + token_scales: Optional[torch.Tensor], per_act_token: bool): + tokens, token_scales = moe_kernel_quantize_input( + tokens, token_scales, self.quant_dtype, per_act_token, + self.block_shape) + return tokens, token_scales + + def _do_dispatch(self, tokens: torch.Tensor, + token_scales: Optional[torch.Tensor], + rank_topk_ids: torch.Tensor, + rank_topk_weights: torch.Tensor, num_experts: int): + + has_scales = token_scales is not None + + (num_tokens_per_rank, num_tokens_per_rdma_rank, expert_num_tokens, + is_token_in_rank, event) = self.buffer.get_dispatch_layout( + topk_idx=rank_topk_ids, + num_experts=num_experts, + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + + token_data = tokens + if has_scales: + token_data = (tokens, token_scales) + + ( + token_data, expert_topk_ids, expert_topk_weights, + expert_num_tokens_per_expert_list, self.handle, event + ) = self.buffer.dispatch( + x=token_data, + handle=None, + num_tokens_per_rank=num_tokens_per_rank, + num_tokens_per_rdma_rank=num_tokens_per_rdma_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=expert_num_tokens, + topk_idx=rank_topk_ids, + topk_weights=rank_topk_weights, + # expert_alignment 
rounds the number of tokens per expert + # to this value. + expert_alignment=1, + config=self._get_dispatch_config(), + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + + if has_scales: + expert_x, expert_x_scale = token_data + else: + expert_x, expert_x_scale = token_data, None + + # The existing MOE kernels assume that all entries of topk_ids are + # valid. To that effect, set the -1s in expert_topk_ids to some expert + # outside this rank so the expert_map can remap it to -1 when safe. + # With Expert Parallel, the experts are divided amongst the rank + # sequentially. For rank 0, set it to num_experts - 1 and for all other + # ranks set it to 0 as we know that expert_map will have a -1 in those + # regions for those ranks. + # + # DeepEP's topk_ids output refers to the local experts directly. Offset + # the topk_ids to move it back to the global experts space so it aligns + # with existing vLLM interfaces. + expert_topk_ids = torch.where( + expert_topk_ids == -1, + num_experts - 1 if self.rank_expert_offset == 0 else 0, + expert_topk_ids + self.rank_expert_offset) + + return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + # Check if there is a block_shape / or if we can infer the quantization + # schemes from the scales. 
+ per_token_quant = None + if all([x is None for x in [self.block_shape, a1_scale, a2_scale] + ]) and self.quant_dtype is not None: + # Quantization required despite none of the inputs suggesting + # quantization. Fallback to per_token_dynamic quant. + per_token_quant = True + else: + per_token_quant = ((self.block_shape is not None) or + (a1_scale is not None and a1_scale.numel() != 1) + or (a2_scale is not None + and a2_scale.numel() != 1)) + + if per_token_quant: + a1q, a1q_scale = self._do_quant(a1, a1_scale, per_act_token=True) + (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) = self._do_dispatch( + tokens=a1q, + token_scales=a1q_scale, + rank_topk_ids=rank_topk_ids, + rank_topk_weights=rank_topk_weights, + num_experts=num_experts) + else: + # DeepEP kernels only support dispatching per-token-quant + # quantization. dispatch in bfloat16. + (expert_x, _, expert_num_tokens, expert_topk_ids, + expert_topk_weights) = self._do_dispatch( + tokens=a1, + token_scales=None, + rank_topk_ids=rank_topk_ids, + rank_topk_weights=rank_topk_weights, + num_experts=num_experts) + # quantize now + expert_x_scale = None + if expert_x.numel() != 0: + expert_x, expert_x_scale = self._do_quant(expert_x, + a1_scale, + per_act_token=False) + + return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) + + def _apply_weights_and_reduce(self, num_tokens: int, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + apply_router_weight_on_input: bool, + output_dtype: torch.dtype): + + if fused_expert_output.ndim == 2: + hidden_dim = fused_expert_output.size(-1) + fused_expert_output = fused_expert_output.view( + num_tokens, -1, hidden_dim) + + if not apply_router_weight_on_input: + # The DeepEP combine kernels don't do the topk weight + # multiplication. We multiply the weights locally. 
+ fused_expert_output = fused_expert_output.to(torch.float32) + fused_expert_output = fused_expert_output * topk_weights.view( + fused_expert_output.size(0), -1, 1) + fused_expert_output = fused_expert_output.to(output_dtype) + + return fused_expert_output.sum(dim=1).to(output_dtype) + + def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + apply_router_weight_on_input: bool) -> None: + + assert self.handle is not None + + # fused_expert_output can have 0 tokens - This happens when none of the + # tokens from the all2all reach this EP rank. + if fused_expert_output.numel() != 0: + fused_expert_output = self._apply_weights_and_reduce( + num_tokens=topk_ids.size(0), + fused_expert_output=fused_expert_output, + topk_weights=topk_weights, + apply_router_weight_on_input=apply_router_weight_on_input, + output_dtype=output.dtype) + + combined_x, _, event = self.buffer.combine( + x=fused_expert_output, + handle=self.handle, + topk_weights=None, + config=self._get_combine_config(), + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + # Respect inplace outputs. + output.copy_(combined_x, non_blocking=True) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py new file mode 100644 index 0000000000000..b9d817a14d57e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import deep_ep +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + +# DeepEP kernels quantize dispatch inputs in 128 element chunks. 
+DEEPEP_QUANT_BLOCK_SIZE = 128 + + +def dequant_fp8(expert_x_fp8: torch.Tensor, + expert_x_scales: torch.Tensor) -> torch.Tensor: + """ + Return dequantized tensor in fp32 + """ + # TODO (varun) : Optimize leverage num_tokens_per_expert counts + assert expert_x_fp8.is_contiguous() + expert_x_scales = expert_x_scales.contiguous() + num_experts = expert_x_fp8.size(0) + + expert_x_fp32 = expert_x_fp8.to(torch.float32).view( + num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE) + expert_x_scales = expert_x_scales.view(num_experts, -1, 1) + return (expert_x_fp32 * expert_x_scales).view(expert_x_fp8.shape) + + +class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + Prepare/Finalize using DeepEP low-latency kernels. + """ + + # DeepEP low-latency kernels are compiled only for certain + # specific hidden sizes. + SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168] + + def __init__(self, + buffer: deep_ep.Buffer, + world_size: int, + dp_size: int, + max_tokens_per_rank: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None, + use_fp8_dispatch: bool = False): + super().__init__() + + self.buffer = buffer + self.world_size = world_size + self.dp_size = dp_size + self.quant_dtype = quant_dtype + self.block_shape = block_shape + self.max_tokens_per_rank = max_tokens_per_rank + self.use_fp8_dispatch = use_fp8_dispatch + # The dispatch function returns a handle that the combine function + # requires. We store the handle here so it is available to the + # combine function. 
+ self.handle = None + + def max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_tokens_per_rank + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.int64 + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + + hidden_size = a1.size(1) + assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \ + (f"Hidden Size {hidden_size} not in supported list of hidden sizes" + f"{self.SUPPORTED_HIDDEN_SIZES}") + + if self.use_fp8_dispatch: + assert hidden_size % 128 == 0, \ + "DeepEP kernels quantize the inputs in blocks of shape 128" + + # Quantize + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + assert not per_act_token, ( + "low_latency kernels don't support per-act-token quant") + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + # Dispatch + expert_x, expert_num_tokens, self.handle, event, hook = \ + self.buffer.low_latency_dispatch(a1, + rank_topk_ids, + self.max_tokens_per_rank, + num_experts, + use_fp8=self.use_fp8_dispatch, + async_finish=False, + return_recv_hook=False) + + if self.use_fp8_dispatch: + # TODO (varun) : In the case of dynamic quantization, we could + # probably skip the quant below and use the results directly. + # Although note that the deepep quant is per token 128 elements. 
+ expert_x_fp8, expert_x_scales = expert_x + expert_x = dequant_fp8(expert_x_fp8, + expert_x_scales).to(dtype=a1.dtype) + + num_experts = expert_x.size(0) + hidden_dim = expert_x.size(-1) + + expert_x = expert_x.view((-1, expert_x.size(-1))) + expert_x, expert_x_scale = moe_kernel_quantize_input( + expert_x, a1_scale, self.quant_dtype, per_act_token, + self.block_shape) + expert_x = expert_x.view((num_experts, -1, hidden_dim)) + + return (expert_x, expert_x_scale, expert_num_tokens, None, None) + + def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + apply_router_weight_on_input: bool) -> None: + + assert self.handle is not None + + combine_topk_weights = topk_weights + if apply_router_weight_on_input: + # weights have already been applied. + combine_topk_weights = torch.ones_like(topk_weights) + + # TODO (varun) : Enable zero copy mode + _, event, hook = self.buffer.low_latency_combine( + fused_expert_output, + topk_ids, + combine_topk_weights, + self.handle, + async_finish=False, + zero_copy=False, + return_recv_hook=False, + out=output) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 205a95e7ff1e4..7490a192df945 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -10,7 +10,8 @@ import triton.language as tl import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.fused_moe import ( get_config_dtype_str, try_get_optimal_moe_config) -from vllm.model_executor.layers.fused_moe.utils import _resize_cache +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, moe_kernel_quantize_input) @triton.jit @@ -397,6 +398,12 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.rank = rank self.max_num_tokens = max_num_tokens + def 
max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_num_tokens + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return None + def prepare( self, a1: torch.Tensor, @@ -407,7 +414,8 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: assert a1.dim() == 2 assert topk_ids.dim() == 2 assert topk_ids.size(0) == a1.size(0) @@ -450,7 +458,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): first_expert, :rows, :] = a1[:topks.numel()][topks] tokens_per_expert[expert_id - first_expert] = rows - return b_a1, a1_scale, tokens_per_expert + return b_a1, a1_scale, tokens_per_expert, None, None def finalize( self, @@ -601,6 +609,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, + per_channel_quant: bool = False, block_shape: Optional[list[int]] = None, world_size: int = 1, dp_size: int = 1, @@ -611,12 +620,15 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): self.use_int4_w4a16 = use_int4_w4a16 self.use_int8_w8a16 = use_int8_w8a16 self.block_shape = block_shape + self.per_channel_quant = per_channel_quant self.max_num_tokens = max_num_tokens - assert not use_int8_w8a8, "NYI" - assert not use_int4_w4a16, "NYI" self.world_size = world_size self.dp_size = dp_size + assert not use_int8_w8a8, "NYI" + assert not use_int4_w4a16, "NYI" + assert self.block_shape is None, "NYI" + def workspace_shapes( self, a: torch.Tensor, @@ -670,8 +682,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn ] - # TODO: num_tokens -> max_num_tokens? 
- E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size( hidden_states, w1, w2, topk_ids) assert w1.size(0) == E @@ -687,7 +698,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w2.size(), top_k_num, config_dtype, - num_tokens, + max_num_tokens, block_shape=self.block_shape, ) @@ -706,10 +717,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}") # We can reuse the memory between these because by the time we need # cache3, we're done with cache1 - intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N)) + intermediate_cache1 = _resize_cache(workspace13, + (E, max_num_tokens, N)) intermediate_cache2 = _resize_cache(workspace2, - (E, num_tokens, N // 2)) - intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K)) + (E, max_num_tokens, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, + (E, max_num_tokens, K)) # MM1 invoke_moe_batched_triton_kernel(A=hidden_states, @@ -731,15 +744,20 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): self.activation(activation, intermediate_cache2.view(-1, N // 2), intermediate_cache1.view(-1, N)) - #qintermediate_cache2 = intermediate_cache2 - a2q_scale = a2_scale - # TODO (varun) : support w8a8 - assert not self.use_fp8_w8a8 - #if self.use_fp8_w8a8: - # qintermediate_cache2, a2q_scale = _fp8_quantize( - # intermediate_cache2, a2_scale, self.block_shape) + ic2_hidden_size = intermediate_cache2.size(-1) + intermediate_cache2 = intermediate_cache2.view(-1, ic2_hidden_size) - invoke_moe_batched_triton_kernel(A=intermediate_cache2, + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + A=intermediate_cache2, + A_scale=a2_scale, + qtype=torch.float8_e4m3fn if self.use_fp8_w8a8 else None, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) + + qintermediate_cache2 = qintermediate_cache2.view( 
+ (E, -1, ic2_hidden_size)) + + invoke_moe_batched_triton_kernel(A=qintermediate_cache2, B=w2, C=intermediate_cache3, expert_num_tokens=expert_num_tokens, @@ -752,5 +770,4 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int4_w4a16=self.use_int4_w4a16, config=config, block_shape=self.block_shape) - return intermediate_cache3 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 883a48c984f21..de7a9a8d0b3bc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1164,7 +1164,7 @@ def fused_experts(hidden_states: torch.Tensor, # permute/unpermute ops are available. N = w1.shape[1] if (allow_deep_gemm and use_fp8_w8a8 and N > 512 - and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + and _valid_deep_gemm(hidden_states, w1, w2)): assert apply_router_weight_on_input is False return deep_gemm_moe_fp8( hidden_states=hidden_states, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3ce4cbc2838e9..1812f3b6759a4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -5,7 +5,7 @@ import importlib from abc import abstractmethod from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional +from typing import Callable, Optional, Union import torch import torch.nn.functional as F @@ -30,16 +30,19 @@ from vllm.platforms.interface import CpuArchEnum from vllm.utils import direct_register_custom_op has_pplx = importlib.util.find_spec("pplx_kernels") is not None +has_deepep = importlib.util.find_spec("deep_ep") is not None if current_platform.is_cuda_alike(): - from .fused_batched_moe import (BatchedPrepareAndFinalize, - BatchedTritonExperts) + from .fused_batched_moe import BatchedTritonExperts from .fused_moe import TritonExperts, fused_experts from .modular_kernel 
import (FusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize) if has_pplx: from .pplx_prepare_finalize import PplxPrepareAndFinalize + if has_deepep: + from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize + from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore @@ -71,10 +74,24 @@ class FusedMoEParallelConfig: use_ep: bool # whether to use EP or not + @property + def use_all2all_kernels(self): + return self.dp_size > 1 and self.use_ep + @property def use_pplx_kernels(self): - return self.dp_size > 1 and self.use_ep and \ - envs.VLLM_ALL2ALL_BACKEND == "pplx" + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "pplx") + + @property + def use_deepep_ht_kernels(self): + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput") + + @property + def use_deepep_ll_kernels(self): + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency") @staticmethod def make(tp_size_: int, dp_size_: int, @@ -231,6 +248,14 @@ class MoEConfig: def use_pplx_kernels(self): return self.moe_parallel_config.use_pplx_kernels + @property + def use_deepep_ht_kernels(self): + return self.moe_parallel_config.use_deepep_ht_kernels + + @property + def use_deepep_ll_kernels(self): + return self.moe_parallel_config.use_deepep_ll_kernels + class FusedMoeWeightScaleSupported(Enum): TENSOR = "tensor" @@ -252,7 +277,16 @@ class FusedMoEMethodBase(QuantizeMethodBase): all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None - prepare_finalize = None + quant_dtype = None + act_quant_block_size = None + from vllm.model_executor.layers.quantization.fp8 import Fp8Config + if isinstance(quant_config, Fp8Config): + act_quant_block_size = quant_config.weight_block_size + quant_dtype = torch.float8_e4m3fn + + prepare_finalize: 
Optional[Union[PplxPrepareAndFinalize, + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize]] = None if moe.use_pplx_kernels: all_to_all_args = dict( max_num_tokens=moe.max_num_tokens, @@ -288,8 +322,49 @@ class FusedMoEMethodBase(QuantizeMethodBase): dp_size=all2all_manager.tp_group.world_size, quant_dtype=moe.in_dtype, ) + elif moe.use_deepep_ht_kernels: + assert moe.dp_size == all2all_manager.dp_world_size + all_to_all_args = dict() + handle = all2all_manager.get_handle(all_to_all_args) + prepare_finalize = DeepEPHTPrepareAndFinalize( + handle, + world_size=all2all_manager.world_size, + rank=all2all_manager.rank, + dp_size=all2all_manager.dp_world_size, + rank_expert_offset=all2all_manager.rank * + moe.num_local_experts, + quant_dtype=quant_dtype, + block_shape=act_quant_block_size, + ) + + elif moe.use_deepep_ll_kernels: + assert moe.dp_size == all2all_manager.dp_world_size + + all_to_all_args = dict( + max_num_tokens_per_dp_rank=moe.max_num_tokens, + token_hidden_size=moe.hidden_dim, + num_ep_ranks=all2all_manager.world_size, + num_global_experts=moe.num_experts, + num_local_experts=moe.num_experts // + all2all_manager.world_size) + handle = all2all_manager.get_handle(all_to_all_args) + + # Note (varun): Whether to use FP8 dispatch or not needs some + # profiling. Turning it off for now. 
+ prepare_finalize = DeepEPLLPrepareAndFinalize( + handle, + world_size=all2all_manager.world_size, + dp_size=all2all_manager.dp_world_size, + max_tokens_per_rank=moe.max_num_tokens, + quant_dtype=quant_dtype, + block_shape=act_quant_block_size, + use_fp8_dispatch=False, + ) + + self.topk_indices_dtype = None if prepare_finalize is not None: + self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() experts = self.select_gemm_impl(prepare_finalize) self.fused_experts = FusedMoEModularKernel( prepare_finalize, @@ -297,7 +372,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ) def select_gemm_impl( - self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize] + self, prepare_finalize: FusedMoEPrepareAndFinalize ) -> FusedMoEPermuteExpertsUnpermute: # based on the all2all implementation, select the appropriate # gemm implementation @@ -334,6 +409,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def __init__(self, moe: MoEConfig): super().__init__() self.fused_experts = fused_experts # type: ignore + self.topk_indices_dtype = None self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -343,8 +419,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: self.rocm_aiter_fused_experts = None # type: ignore - def select_gemm_impl( - self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]): + def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize): assert self.fused_experts == fused_experts @@ -353,11 +428,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): experts: Optional[FusedMoEPermuteExpertsUnpermute] = None - if isinstance(prepare_finalize, - (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + use_batched_experts = prepare_finalize.max_num_tokens_per_rank( + ) is not None + if use_batched_experts: logger.debug("BatchedTritonExperts %s", self.moe) + assert self.moe.dp_size == all2all_manager.dp_world_size experts = BatchedTritonExperts( - 
max_num_tokens=MOE_DP_CHUNK_SIZE, + max_num_tokens=self.moe.max_num_tokens, world_size=all2all_manager.world_size, # dp_size actually means tp_size, bug in pplx kernels dp_size=all2all_manager.tp_group.world_size, @@ -366,6 +443,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): use_int8_w8a16=False, use_int4_w4a16=False, block_shape=None, + per_channel_quant=False, ) else: logger.debug("TritonExperts %s", self.moe) @@ -494,6 +572,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -505,7 +584,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, - indices_type=torch.uint32 if self.moe.use_pplx_kernels else None) + indices_type=self.topk_indices_dtype) if self.rocm_aiter_moe_enabled: assert expert_map is None @@ -806,11 +885,8 @@ class FusedMoE(torch.nn.Module): # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. 
quant_method: Optional[QuantizeMethodBase] = None - - if quant_config is None: - quant_method = UnquantizedFusedMoEMethod(moe) - else: - quant_method = quant_config.get_quant_method(self, prefix) + quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None + else quant_config.get_quant_method(self, prefix)) assert quant_method is not None assert isinstance(quant_method, FusedMoEMethodBase) @@ -836,7 +912,8 @@ class FusedMoE(torch.nn.Module): # Chunked all2all staging tensor self.batched_hidden_states: Optional[torch.Tensor] = None self.batched_router_logits: Optional[torch.Tensor] = None - if self.moe_parallel_config.use_pplx_kernels: + if (self.moe_parallel_config.use_pplx_kernels + or self.moe_parallel_config.use_deepep_ll_kernels): act_dtype = vllm_config.model_config.dtype self.batched_hidden_states = torch.zeros( (MOE_DP_CHUNK_SIZE, self.hidden_size), @@ -880,6 +957,14 @@ class FusedMoE(torch.nn.Module): def use_pplx_kernels(self): return self.moe_parallel_config.use_pplx_kernels + @property + def use_deepep_ht_kernels(self): + return self.moe_parallel_config.use_deepep_ht_kernels + + @property + def use_deepep_ll_kernels(self): + return self.moe_parallel_config.use_deepep_ll_kernels + def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, @@ -1210,19 +1295,21 @@ class FusedMoE(torch.nn.Module): When just tensor-parallel is used, it is not required to reduce the shared_experts results immediately. Instead we reduce at the once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and the pplx kernels - this is no longer viable as all + With EP and all2all kernels - this is no longer viable as all GPU ranks in DP, produce the complete set of hidden_states. Therefore it is required that we reduce the shared_experts output early. 
""" - return self.use_pplx_kernels + return (self.use_pplx_kernels or self.use_deepep_ht_kernels + or self.use_deepep_ll_kernels) def maybe_all_reduce_tensor_model_parallel( self, final_hidden_states: torch.Tensor): """ The pplx combine kernel reduces across GPU ranks by default. """ - if self.use_pplx_kernels: + if (self.use_pplx_kernels or self.use_deepep_ht_kernels + or self.use_deepep_ll_kernels): return final_hidden_states else: return tensor_model_parallel_all_reduce(final_hidden_states) @@ -1289,7 +1376,7 @@ class FusedMoE(torch.nn.Module): ctx = get_forward_context() max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu - moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE + moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens num_tokens = full_hidden_states.size(0) for chunk_start_ in range(0, max_tokens_across_dp, @@ -1310,12 +1397,17 @@ class FusedMoE(torch.nn.Module): def forward_impl(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): assert self.quant_method is not None - if self.moe_parallel_config.use_pplx_kernels: + if (self.moe_parallel_config.use_pplx_kernels + or self.moe_parallel_config.use_deepep_ll_kernels): return self.forward_impl_chunked(hidden_states, router_logits) - if self.dp_size > 1: + do_naive_dispatch_combine: bool = ( + self.dp_size > 1 + and not self.moe_parallel_config.use_deepep_ht_kernels) + if do_naive_dispatch_combine: hidden_states, router_logits = get_ep_group().dispatch( hidden_states, router_logits) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1335,12 +1427,12 @@ class FusedMoE(torch.nn.Module): apply_router_weight_on_input=self.apply_router_weight_on_input, ) - if self.dp_size > 1: + if do_naive_dispatch_combine: final_hidden_states = get_ep_group().combine(final_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - # Default set to False. (May have to add shared expert outputs.) 
- final_hidden_states = tensor_model_parallel_all_reduce( + # Default set to False. (May have to add shared expert outputs. + final_hidden_states = self.maybe_all_reduce_tensor_model_parallel( final_hidden_states) return final_hidden_states diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 5e321c9b43af7..2c27d31eb6eb9 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -94,7 +94,8 @@ class FusedMoEPrepareAndFinalize(ABC): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform any quantization (and/or) dispatching needed for this kernel. @@ -113,6 +114,10 @@ class FusedMoEPrepareAndFinalize(ABC): Returns a tuple of: - quantized + dispatched a. - quantized + dispatched a1_scales. + - Optional tensor as big as number of local experts that contains the + number of tokens assigned to each local expert. + - Optional dispatched expert topk IDs + - Optional dispatched expert topk weight """ raise NotImplementedError @@ -138,6 +143,27 @@ class FusedMoEPrepareAndFinalize(ABC): """ raise NotImplementedError + @abstractmethod + def topk_indices_dtype(self) -> Optional[torch.dtype]: + """ + The PrepareFinalize All2All implementations generally constrain the + dtype of the topk_ids they support. This function returns the + required topk indices dtype so it can be respected. + Return None if there are no such restrictions. + """ + raise NotImplementedError + + @abstractmethod + def max_num_tokens_per_rank(self) -> Optional[int]: + """ + Some PrepareFinalize All2All implementations are batched. Meaning, + they can processes only as set of tokens at a time. 
This + function returns the batch size i.e the maximum number of tokens + the implementation can process at a time. + Return None if there are no such restrictions. + """ + raise NotImplementedError + class FusedMoEPermuteExpertsUnpermute(ABC): """ @@ -261,6 +287,61 @@ class FusedMoEModularKernel(torch.nn.Module): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts + def _do_fused_experts( + self, + a1: torch.Tensor, # input to forward fn + a1q: torch.Tensor, # output of prepare fn + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + expert_num_tokens: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor]) -> torch.Tensor: + + _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) + + # Use a1 here to decipher the correct workspace datatype + workspace13_shape, workspace2_shape, workspace_dtype = ( + self.fused_experts.workspace_shapes(a1, M, N, K, top_k, + global_num_experts)) + + # We can reuse the memory between cache1 and cache3 because by the time + # we need cache3, we're done with cache1 + workspace13 = torch.zeros(workspace13_shape, + device=a1.device, + dtype=workspace_dtype) + workspace2 = torch.zeros(workspace2_shape, + device=a1.device, + dtype=workspace_dtype) + + fused_out = self.fused_experts.apply( + a1q, + w1, + w2, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=a1q_scale, + a2_scale=a2_scale, + workspace13=workspace13, + workspace2=workspace2, + expert_num_tokens=expert_num_tokens, + ) + + return fused_out + def forward( self, hidden_states: torch.Tensor, @@ -315,49 +396,48 @@ class 
FusedMoEModularKernel(torch.nn.Module): Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ + a1 = hidden_states - E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids) - - if global_num_experts == -1: - global_num_experts = E - output = a1 if inplace else torch.zeros_like(a1) - workspace13_shape, workspace2_shape, workspace_dtype = ( - self.fused_experts.workspace_shapes(a1, M, N, K, top_k, - global_num_experts)) + if global_num_experts == -1: + global_num_experts = w1.size(0) - # We can reuse the memory between cache1 and cache3 because by the time - # we need cache3, we're done with cache1 - workspace13 = torch.zeros(workspace13_shape, - device=a1.device, - dtype=workspace_dtype) - workspace2 = torch.zeros(workspace2_shape, - device=a1.device, - dtype=workspace_dtype) + (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids, + _expert_topk_weights) = self.prepare_finalize.prepare( + a1, a1_scale, a2_scale, topk_weights, topk_ids, + global_num_experts, expert_map, apply_router_weight_on_input) + # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. + topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids + topk_weights = (topk_weights if _expert_topk_weights is None else + _expert_topk_weights) - a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare( - a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts, - expert_map, apply_router_weight_on_input) - - fused_out = self.fused_experts.apply( - a1q, - w1, - w2, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1q_scale=a1q_scale, - a2_scale=a2_scale, - workspace13=workspace13, - workspace2=workspace2, - expert_num_tokens=expert_num_tokens, - ) + fused_out = None + if a1q.numel() == 0: + # This happens when none of the tokens from the all2all reach this + # EP rank. 
Also, note that this is only relevant for CUDAGraph + # incompatible all2all kernels like the DeepEP high-throughput + # kernels. CUDAGraph compatible all2all kernels like the pplx + # kernels and the DeepEP low-latency kernels are always batched + # and can never run into the tensor.numel() == 0 case. + fused_out = torch.empty_like(a1q).to(dtype=a1.dtype) + else: + fused_out = self._do_fused_experts( + a1=a1, + a1q=a1q, + w1=w1, + w2=w2, + topk_ids=topk_ids, + expert_num_tokens=expert_num_tokens, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=a1q_scale, + a2_scale=a2_scale) self.prepare_finalize.finalize(output, fused_out, topk_weights, topk_ids, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index da78714341513..89481e5bd6b0a 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -25,7 +25,7 @@ def _moe_permute( """ top_k_num = curr_topk_ids.size(1) - tokens_in_chunk = curr_hidden_states.sizze(0) + tokens_in_chunk = curr_hidden_states.size(0) sorted_token_ids, expert_ids, num_tokens_post_padded = ( moe_align_block_size(curr_topk_ids, @@ -37,11 +37,12 @@ def _moe_permute( inv_perm: Optional[torch.Tensor] = None num_tokens = top_k_num * tokens_in_chunk - sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0) inv_perm = torch.argsort(sorted_token_ids)[:num_tokens] # Permute according to sorted token ids. 
+ sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) + curr_hidden_states = _fp8_perm(curr_hidden_states, sorted_token_ids // top_k_num) diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 8405603cf28a0..1170a16f3de2f 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -32,6 +32,12 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.dp_size = dp_size self.quant_dtype = quant_dtype + def max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_num_tokens + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.uint32 + def prepare( self, a1: torch.Tensor, @@ -42,7 +48,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: num_tokens = a1.size(0) # M hidden_dim = a1.size(-1) # K @@ -115,7 +122,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): bound_m=bound_m, ) - return expert_x, expert_x_scale, expert_num_tokens + return expert_x, expert_x_scale, expert_num_tokens, None, None def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 77a9686c93a63..9ed95e1de9fed 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -24,6 +24,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): self.block_shape = block_shape self.quant_dtype = quant_dtype + def max_num_tokens_per_rank(self) -> Optional[int]: + return None + + def 
topk_indices_dtype(self) -> Optional[torch.dtype]: + return None + def prepare( self, a1: torch.Tensor, @@ -34,7 +40,9 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool = False, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + if apply_router_weight_on_input: topk = topk_ids.size(1) # TODO: this only works for topK=1, will need to update for topK>1 @@ -47,7 +55,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): self.per_channel_quant, self.block_shape) - return a1q, a1q_scale, None + return a1q, a1q_scale, None, None, None def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 373e8ab396bc3..920931a93d3e8 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -29,9 +29,10 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): per_channel_quant=per_channel_quant, block_shape=block_shape, block_m=block_m) - self.deep_gemm_expert = DeepGemmExperts() self.allow_deep_gemm = allow_deep_gemm self.use_fp8_w8a8 = use_fp8_w8a8 + self.deep_gemm_expert = DeepGemmExperts( + ) if self.allow_deep_gemm else None def workspace_shapes( self, @@ -46,6 +47,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K): + assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( a, M, N, K, topk, num_experts) else: @@ -73,7 +75,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ) -> torch.Tensor: N = w1.size(1) if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 - and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + and _valid_deep_gemm(hidden_states, w1, w2)): + assert self.deep_gemm_expert is not None return self.deep_gemm_expert.apply( hidden_states, w1, diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index c3a58478247a7..692482c2ea692 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -18,8 +18,8 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor: Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches. """ - assert prod( - v) <= x.numel(), f"{prod(v)} <= {x.numel()}" # CUDAGRAPH unfriendly? + assert prod(v) <= x.numel( + ), f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})" # CUDAGRAPH unfriendly? return x.flatten()[:prod(v)].view(*v) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index cea4d26a4c48f..2438ec30bdd2b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -3,7 +3,7 @@ import functools import importlib.util -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -452,6 +452,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): if envs.VLLM_USE_DEEP_GEMM: if not has_deep_gemm: logger.warning_once("Failed to import DeepGemm kernels.") + elif not self.block_quant: + logger.warning_once("Model is not block quantized. 
Not using " + " DeepGemm kernels") elif (current_platform.is_cuda() and current_platform.has_device_capability(90)): logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.") @@ -460,8 +463,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): logger.warning_once( "DeepGemm not supported on the current platform.") + self.topk_indices_dtype = None self.fused_experts = functools.partial( # type: ignore fused_experts, + use_fp8_w8a8=True, block_shape=self.quant_config.weight_block_size, allow_deep_gemm=self.allow_deep_gemm) @@ -765,18 +770,39 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w2_input_scale def select_gemm_impl(self, prepare_finalize): + + from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet.") - experts = TritonOrDeepGemmExperts( - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - ) + experts: Optional[Union[BatchedTritonExperts, + TritonOrDeepGemmExperts]] = None + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + use_batched_experts = max_num_tokens_per_rank is not None + if use_batched_experts: + experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens_per_rank, + world_size=prepare_finalize.world_size, + dp_size=prepare_finalize.dp_size, + use_fp8_w8a8=True, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + experts = TritonOrDeepGemmExperts( + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + ) + + assert experts is not None return experts def apply( @@ -797,6 +823,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> 
torch.Tensor: + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -808,6 +835,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) if self.rocm_aiter_moe_enabled: @@ -855,7 +883,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_ids=topk_ids, inplace=True, activation=activation, - use_fp8_w8a8=True, global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, expert_map=expert_map, diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index e2d9424dee280..07ae470fabfb8 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -154,6 +154,21 @@ class CudaPlatformBase(Platform): logger.info( "Forcing kv cache block size to 64 for FlashMLA backend.") + if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" + and parallel_config.data_parallel_size > 1 + and vllm_config.compilation_config.use_cudagraph): + logger.info( + "Data Parallel: Forcing enforce eager to be True since DP " + "with DeepEP high-throughput kernels are not CUDA Graph " + "compatible. The DeepEP low-latency kernels are CUDA Graph " + "compatible. Set the all_to_all backend to deepep_low_latency " + "to use those kernels instead.") + vllm_config.compilation_config.use_cudagraph = False + vllm_config.model_config.enforce_eager = True + # TODO (varun): Turning this ON gives incorrect results for the + # Deepseek-V2-lite model. 
+ vllm_config.compilation_config.use_inductor = False + @classmethod def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None From bdf13965ab4a528d30cb82854487910189865d9d Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:33:07 -0700 Subject: [PATCH 029/115] [V1] Support cross-layer KV sharing (#18212) Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 227 +++++++++++++++- tests/v1/worker/test_gpu_model_runner.py | 244 +++++++++++++++++- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 3 + vllm/attention/backends/cpu_mla.py | 3 +- .../backends/dual_chunk_flash_attn.py | 3 + vllm/attention/backends/flash_attn.py | 3 + vllm/attention/backends/flashinfer.py | 3 + vllm/attention/backends/flashmla.py | 3 +- vllm/attention/backends/hpu_attn.py | 3 + vllm/attention/backends/ipex_attn.py | 3 + vllm/attention/backends/mla/common.py | 3 + vllm/attention/backends/pallas.py | 3 + vllm/attention/backends/rocm_aiter_mla.py | 3 +- vllm/attention/backends/rocm_flash_attn.py | 3 + vllm/attention/backends/torch_sdpa.py | 3 + vllm/attention/backends/triton_mla.py | 3 +- vllm/attention/backends/xformers.py | 3 + vllm/attention/layer.py | 17 +- vllm/v1/attention/backends/flash_attn.py | 36 +-- vllm/v1/attention/backends/flashinfer.py | 36 +-- vllm/v1/attention/backends/mla/common.py | 4 + vllm/v1/attention/backends/mla/flashmla.py | 3 +- .../attention/backends/mla/rocm_aiter_mla.py | 3 +- vllm/v1/attention/backends/mla/triton_mla.py | 3 +- vllm/v1/attention/backends/pallas.py | 6 +- vllm/v1/attention/backends/triton_attn.py | 51 ++-- vllm/v1/attention/backends/utils.py | 33 +++ vllm/v1/worker/gpu_model_runner.py | 31 ++- vllm/v1/worker/tpu_model_runner.py | 30 ++- vllm/v1/worker/utils.py | 36 +++ 31 files changed, 733 insertions(+), 73 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py 
b/tests/v1/tpu/worker/test_tpu_model_runner.py index 230c97e787a98..bc54b6ecc749e 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -4,8 +4,13 @@ import unittest.mock as mock import pytest -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.attention.layer import Attention +from vllm.config import (CacheConfig, ModelConfig, SchedulerConfig, VllmConfig, + set_current_vllm_config) from vllm.sampling_params import SamplingParams +from vllm.utils import GiB_bytes +from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, + get_kv_cache_config) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.worker.tpu_model_runner import ( @@ -363,3 +368,223 @@ def test_get_req_paddings(): assert _get_req_paddings(1, 32) == [8, 16, 32] assert _get_req_paddings(8, 32) == [8, 16, 32] assert _get_req_paddings(8, 36) == [8, 16, 32, 36] + + +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} must come before the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + kv_sharing_target_layer_name=layer_1, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + invalid_layer = "model.layers.0.cross_attn.attn" + error_msg = f"{invalid_layer} is not a valid Attention layer in the model" + with 
pytest.raises(ValueError, match=error_msg): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + # invalid layer: cross_attn.atn doesn't exist! + kv_sharing_target_layer_name=invalid_layer, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_same_as_current(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} cannot be the same as the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_without_kv_sharing(model_runner): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = model_runner.vllm_config + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + kv_cache_spec = model_runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 2 + assert len(model_runner.shared_kv_cache_layers) == 0 + + available_memory = 20 * GiB_bytes + # page size for layer 
0's kv_cache_spec is 32KB + num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 2 + assert kv_cache_config.tensors[layer_0].size == available_memory // 2 + assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 2 block worth of memory (2 * 32kb) + kv_cache_config.num_blocks = 1 + for layer in kv_cache_config.tensors: + kv_cache_config.tensors[layer].size =\ + kv_cache_spec[layer].page_size_bytes + + model_runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache does NOT share memory with layer 0 + assert id(layer_1_kv) != id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 + + +def test_init_kv_cache_with_kv_sharing_valid(model_runner): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = model_runner.vllm_config + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name="model.layers.0.self_attn.attn", + ) + } + # suppress var 
not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + kv_cache_spec = model_runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 1 + assert layer_0 in kv_cache_spec + assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + # with KV sharing, we can allocate (available_mem//page_size//1) blocks + # which is twice as many as without KV sharing + num_expected_blocks = 655360 # 20GB / 32KB + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 1 + # Each layer now has twice the available memory for KV cache + # compared to no KV sharing + assert kv_cache_config.tensors[layer_0].size == available_memory + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 2 * 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 1 block worth of memory (32kb) + kv_cache_config.num_blocks = 1 + kv_cache_config.tensors[layer_0].size =\ + kv_cache_spec[layer_0].page_size_bytes + + model_runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache shares memory with layer 0 + assert id(layer_1_kv) == id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert 
kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index ceb9d4df25e62..5e2fd2fbf747b 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -7,8 +7,11 @@ import pytest from vllm.attention import Attention from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) + SchedulerConfig, VllmConfig, set_current_vllm_config) from vllm.sampling_params import SamplingParams +from vllm.utils import GiB_bytes +from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, + get_kv_cache_config) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -19,6 +22,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner BLOCK_SIZE = 16 NUM_BLOCKS = 10 +DEVICE = "cuda" def initialize_kv_cache(runner: GPUModelRunner): @@ -55,8 +59,7 @@ def initialize_kv_cache(runner: GPUModelRunner): runner.initialize_attn_backend(kv_cache_config) -@pytest.fixture -def model_runner(): +def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, @@ -84,13 +87,18 @@ def model_runner(): scheduler_config=scheduler_config, parallel_config=parallel_config, ) - num_heads = model_config.get_num_kv_heads(parallel_config) + return vllm_config + + +@pytest.fixture +def model_runner(): + vllm_config = get_vllm_config() + model_config = vllm_config.model_config + num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context[ "layer.0"] = Attention(num_heads, head_size, 0.1) - - device = "cuda" - runner = GPUModelRunner(vllm_config, device) + runner = GPUModelRunner(vllm_config, DEVICE) initialize_kv_cache(runner) return runner @@ -385,3 +393,225 @@ def 
test_load_model_weights_inplace(dist_init, model_runner, model_runner_2): model_runner_2.load_model() # Load real weights inplace assert str(model_runner.get_model().state_dict()) == str( model_runner_2.get_model().state_dict()) + + +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} must come before the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + kv_sharing_target_layer_name=layer_1, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + invalid_layer = "model.layers.0.cross_attn.attn" + error_msg = f"{invalid_layer} is not a valid Attention layer in the model" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + # invalid layer: cross_attn.atn doesn't exist! 
+ kv_sharing_target_layer_name=invalid_layer, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_same_as_current(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} cannot be the same as the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_without_kv_sharing(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = get_vllm_config() + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + runner = GPUModelRunner(vllm_config, DEVICE) + kv_cache_spec = runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 2 + assert len(runner.shared_kv_cache_layers) == 0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert 
len(kv_cache_config.tensors) == 2 + assert kv_cache_config.tensors[layer_0].size == available_memory // 2 + assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 2 block worth of memory (2 * 32kb) + kv_cache_config.num_blocks = 1 + for layer in kv_cache_config.tensors: + kv_cache_config.tensors[layer].size =\ + kv_cache_spec[layer].page_size_bytes + + runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache does NOT share memory with layer 0 + assert id(layer_1_kv) != id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 + + +def test_init_kv_cache_with_kv_sharing_valid(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = get_vllm_config() + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name="model.layers.0.self_attn.attn", + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + runner = 
GPUModelRunner(vllm_config, DEVICE) + kv_cache_spec = runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 1 + assert layer_0 in kv_cache_spec + assert runner.shared_kv_cache_layers[layer_1] == layer_0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + # with KV sharing, we can allocate (available_mem//page_size//1) blocks + # which is twice as many as without KV sharing + num_expected_blocks = 655360 # 20GB / 32KB + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 1 + # Each layer now has twice the available memory for KV cache + # compared to no KV sharing + assert kv_cache_config.tensors[layer_0].size == available_memory + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 2 * 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 1 block worth of memory (32kb) + kv_cache_config.num_blocks = 1 + kv_cache_config.tensors[layer_0].size =\ + kv_cache_spec[layer_0].page_size_bytes + + runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache shares memory with layer 0 + assert id(layer_1_kv) == id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index deb3951d6617b..0ba5a5bf94c9b 100644 --- a/vllm/attention/backends/abstract.py +++ 
b/vllm/attention/backends/abstract.py @@ -270,6 +270,7 @@ class AttentionImpl(ABC, Generic[T]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index a2fd557f8e0cb..c1663516de358 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -306,7 +306,10 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") assert blocksparse_params is not None assert alibi_slopes is None, ValueError( "Alibi not support for blocksparse flash attention.") diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index 39e667bca9cd2..cf7883e121abb 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -206,12 +206,13 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 
3548df88d0c5d..963bccdf21bc0 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -290,9 +290,12 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, layer_idx: int = -1, dual_chunk_attention_config: Optional[Dict[str, Any]] = None, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 26be2c04f297e..73e3772682e69 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -618,8 +618,11 @@ class FlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "FlashAttention does not support block-sparse attention.") diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7ae7ea37f4afc..a3937760f03b8 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -936,8 +936,11 @@ class FlashInferImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise 
NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in FlashInfer is not supported yet, it will fall" diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 9a6b8a40e1311..e185d0260d0a0 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -184,12 +184,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str] = None, # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 5128e49752e11..9bd513fd894f5 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -110,9 +110,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: super(AttentionImpl, self).__init__() + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in HPU is not supported yet, it will fall back " diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 30441b3ad136a..5051c6a7cc4fd 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -123,8 +123,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): blocksparse_params: 
Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in Ipex is not supported yet, it will fall" diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 50842abd3924f..78cf952881303 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1000,6 +1000,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments q_lora_rank: Optional[int], kv_lora_rank: int, @@ -1009,6 +1010,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): v_head_dim: int, kv_b_proj: ColumnParallelLinear, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing not supported in V0.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index a6823ac059fb7..7ad67615d33d9 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -109,8 +109,11 @@ class PallasAttentionBackendImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in Pallas is not supported yet, it will fall back " diff --git 
a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index 855036071d0d1..1edf34351db3f 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -370,12 +370,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 755e0da06cef9..4b460dc0b58cd 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -494,8 +494,11 @@ class ROCmFlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in ROCm Flash Attention is not supported yet, it " diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 7606340044f1d..f3fb5adcf05ce 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -405,8 +405,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + 
kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "Torch SPDA does not support block-sparse attention.") diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index d9fff8fac1584..e06f7d54e3421 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -38,12 +38,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8355e03977e78..04ef928b7d7b3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -390,8 +390,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "XFormers does not support block-sparse attention.") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 6c5b05a5c7b14..a5fbd1a1c0166 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ 
-21,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import _Backend, current_platform from vllm.utils import direct_register_custom_op +from vllm.v1.attention.backends.utils import validate_kv_sharing_target class Attention(nn.Module): @@ -50,6 +51,7 @@ class Attention(nn.Module): use_mla: bool = False, prefix: str = "", attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, **extra_impl_args, ) -> None: """ @@ -135,7 +137,7 @@ class Attention(nn.Module): self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **extra_impl_args) + kv_sharing_target_layer_name, **extra_impl_args) self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype @@ -153,6 +155,19 @@ class Attention(nn.Module): compilation_config.static_forward_context[prefix] = self self.layer_name = prefix self.attn_type = attn_type + + if kv_sharing_target_layer_name is not None: + if not envs.VLLM_USE_V1: + raise NotImplementedError( + "Cross-layer KV sharing is not supported in V0.") + + validate_kv_sharing_target( + prefix, + kv_sharing_target_layer_name, + compilation_config.static_forward_context, + ) + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + # use a placeholder kv cache tensor during init, which will be replaced # by bind_kv_cache # this variable will not be accessed if use_direct_call is True diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9e989df1cd892..a92c51883af1c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -485,6 +485,7 @@ class FlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: 
AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: if blocksparse_params is not None: @@ -506,6 +507,7 @@ class FlashAttentionImpl(AttentionImpl): # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -569,22 +571,26 @@ class FlashAttentionImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the input keys and values and store them in the cache. - # NOTE(woosuk): Here, key and value are padded while slot_mapping is - # not padded. However, we don't need to do key[:num_actual_tokens] and - # value[:num_actual_tokens] because the reshape_and_cache_flash op uses - # the slot_mapping's shape to determine the number of actual tokens. key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. 
+ torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(torch.float8_e4m3fn) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8bd998eba7695..f1b61c152a9d8 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -507,6 +507,7 @@ class FlashInferImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -521,6 +522,7 @@ class FlashInferImpl(AttentionImpl): self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -568,21 +570,25 @@ class FlashInferImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the input keys and values and store them in the cache. - # NOTE(woosuk): Here, key and value are padded while slot_mapping is - # not padded. However, we don't need to do key[:num_actual_tokens] and - # value[:num_actual_tokens] because the reshape_and_cache_flash op uses - # the slot_mapping's shape to determine the number of actual tokens. 
- torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[:, 0], - kv_cache[:, 1], - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[:, 0], + kv_cache[:, 1], + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) window_left = (self.sliding_window[0] if self.sliding_window is not None else -1) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 96befca5a1e94..06acbb909a4f6 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -586,6 +586,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments q_lora_rank: Optional[int], kv_lora_rank: int, @@ -595,6 +596,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): v_head_dim: int, kv_b_proj: ColumnParallelLinear, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported for MLA") + self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 060a7c9d8c853..318b8ede14366 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ 
b/vllm/v1/attention/backends/mla/flashmla.py @@ -93,12 +93,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 8925b5a5cd7d0..1f0406a7ac1f8 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -139,12 +139,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert (num_heads == 16 or num_heads == 128), ( f"Aiter MLA only supports 16 or 128 number of heads.\n" f"Provided {num_heads} number of heads.\n" diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 0857fc133c431..e26d7909184b5 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -41,12 +41,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: 
super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 896f1394cfa4b..0f956ba88b9c1 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -113,6 +113,7 @@ class PallasAttentionBackendImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: if use_irope: @@ -128,6 +129,7 @@ class PallasAttentionBackendImpl(AttentionImpl): self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -181,7 +183,9 @@ class PallasAttentionBackendImpl(AttentionImpl): num_tokens, hidden_size = query.shape query = query.view(num_tokens, self.num_heads, self.head_size) - if kv_cache.numel() > 0: + if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0: + # Write input keys and values to the KV cache. + # Skip this if sharing KV cache with an earlier attention layer. 
slot_mapping = attn_metadata.slot_mapping write_to_kv_cache(key, value, kv_cache, slot_mapping) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 6a3314dd87889..968f137011186 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -88,6 +88,7 @@ class TritonAttentionImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: if blocksparse_params is not None: @@ -109,6 +110,7 @@ class TritonAttentionImpl(AttentionImpl): # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name self.use_irope = use_irope @@ -178,31 +180,34 @@ class TritonAttentionImpl(AttentionImpl): if use_prefill_decode_attn: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) - - # Reshape the input keys and values and store them in the cache. - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - else: key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. 
+ if use_prefill_decode_attn: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + else: + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(self.fp8_dtype) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 2e65619ed7bc8..72c7643539273 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -17,3 +17,36 @@ class CommonAttentionMetadata: seq_lens: torch.Tensor """(batch_size,), the length of each request including both computed tokens and newly scheduled tokens""" + + +def validate_kv_sharing_target(current_layer_name, target_layer_name, + static_forward_context): + error_msg = (f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} ") + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + 
target_layer_attn_type = static_forward_context[ + target_layer_name].attn_type + expected = static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise ValueError( + error_msg + + f"must be the same type as the current layer ({expected}).") diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c96ad0c015301..b7448be26f107 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -59,8 +59,8 @@ from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, - scatter_mm_placeholders) +from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, + sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: import xgrammar as xgr @@ -276,6 +276,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + # Layer pairings for cross-layer KV sharing. + # If an Attention layer `layer_name` is in the keys of this dict, it + # means this layer will perform attention using the keys and values + # from the KV cache of `shared_kv_cache_layers[layer_name]`. + self.shared_kv_cache_layers: dict[str, str] = {} + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention @@ -2097,6 +2103,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): # KV cache specs. 
raise ValueError("Unknown KV cache spec type.") + # Setup `kv_cache_config` and `kv_caches` for models + # with cross-layer KV sharing + if self.shared_kv_cache_layers: + initialize_kv_cache_for_kv_sharing( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + kv_caches, + ) + if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) # validate all draft model layers belong to the same kv cache @@ -2125,6 +2140,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): use_mla = self.vllm_config.model_config.use_mla kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in layers.items(): + if (kv_tgt_layer := + attn_module.kv_sharing_target_layer_name) is not None: + # The layer doesn't need its own KV cache and will use that of + # the target layer. We skip creating a KVCacheSpec for it, so + # that KV cache management logic will act as this layer does + # not exist, and doesn't allocate KV cache for the layer. This + # enables the memory saving of cross-layer kv sharing, allowing + # a given amount of memory to accommodate longer context lengths + # or enable more requests to be processed simultaneously. 
+ self.shared_kv_cache_layers[layer_name] = kv_tgt_layer + continue + # TODO: Support other attention modules, e.g., cross-attention if attn_module.attn_type == AttentionType.DECODER: if attn_module.sliding_window is not None: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 48ea3cb7bff0d..f15234f49ce05 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -44,7 +44,8 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import sanity_check_mm_encoder_outputs +from .utils import (initialize_kv_cache_for_kv_sharing, + sanity_check_mm_encoder_outputs) if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -238,6 +239,12 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.num_reqs_paddings = _get_req_paddings( min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs) + # Layer pairings for cross-layer KV sharing. + # If an Attention layer `layer_name` is in the keys of this dict, it + # means this layer will perform attention using the keys and values + # from the KV cache of `shared_kv_cache_layers[layer_name]`. + self.shared_kv_cache_layers: dict[str, str] = {} + # tensors for structured decoding self.grammar_bitmask_cpu = torch.zeros( (self.max_num_reqs, cdiv(self.vocab_size, 32)), @@ -455,6 +462,18 @@ class TPUModelRunner(LoRAModelRunnerMixin): block_size = self.vllm_config.cache_config.block_size kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in layers.items(): + if (kv_tgt_layer := + attn_module.kv_sharing_target_layer_name) is not None: + # The layer doesn't need its own KV cache and will use that of + # the target layer. We skip creating a KVCacheSpec for it, so + # that KV cache management logic will act as this layer does + # not exist, and doesn't allocate KV cache for the layer. 
This + # enables the memory saving of cross-layer kv sharing, allowing + # a given amount of memory to accommodate longer context lengths + # or enable more requests to be processed simultaneously. + self.shared_kv_cache_layers[layer_name] = kv_tgt_layer + continue + if attn_module.attn_type == AttentionType.DECODER: if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( @@ -1376,6 +1395,15 @@ class TPUModelRunner(LoRAModelRunnerMixin): else: raise NotImplementedError + # Setup `kv_cache_config` and `kv_caches` for models + # with cross-layer KV sharing + if self.shared_kv_cache_layers: + initialize_kv_cache_for_kv_sharing( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + kv_caches, + ) + bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b23b28c1d7e9c..055cf01530f02 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -4,6 +4,8 @@ from typing import Optional import torch +from vllm.v1.kv_cache_interface import KVCacheGroupSpec + def sanity_check_mm_encoder_outputs( mm_embeddings: object, @@ -73,3 +75,37 @@ def gather_mm_placeholders( return placeholders return placeholders[is_embed] + + +def initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers: dict[str, str], + kv_cache_groups: list[KVCacheGroupSpec], + kv_caches: dict[str, torch.Tensor], +) -> None: + """ + Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches` + for layers that do not allocate its own KV cache, based on the mapping in + `shared_kv_cache_layers`. Adds these layers to the corresponding KV cache + group, which is needed to ensure that attention metadata is assigned later. + + Args: + shared_kv_cache_layers: Layer pairings for cross-layer KV sharing. 
+ If an Attention layer `layer_name` is in the keys of this dict, it + means this layer will perform attention using the keys and values + from the KV cache of `shared_kv_cache_layers[layer_name]`. + kv_cache_groups: The KV cache groups of the model. + kv_caches: The allocated kv_caches with layer names as keys. + Note that layers in shared_kv_cache_layers.keys() are not + originally included as it only contains layers which have its own + KV cache allocation. + """ + # Record index of KV cache group for each layer that allocates a KV cache. + layer_to_kv_cache_group_idx: dict[str, int] = {} + for i, kv_cache_group in enumerate(kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + layer_to_kv_cache_group_idx[layer_name] = i + + for layer_name, target_layer_name in shared_kv_cache_layers.items(): + kv_caches[layer_name] = kv_caches[target_layer_name] + group_idx = layer_to_kv_cache_group_idx[target_layer_name] + kv_cache_groups[group_idx].layer_names.append(layer_name) From e31446b6c8d887cdca031abf8527555adee46058 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 16:48:25 -0400 Subject: [PATCH 030/115] [Perf] Tune `scaled_fp8_quant` by increasing vectorization (#18844) Signed-off-by: mgoin --- csrc/quantization/fp8/common.cu | 35 +++--- csrc/quantization/fp8/common.cuh | 66 +++++------ .../fused_kernels/layernorm_utils.cuh | 107 +++++++++--------- csrc/quantization/vectorization.cuh | 23 ++-- 4 files changed, 118 insertions(+), 113 deletions(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index eceb3a8ea05da..f3f9f669e00a4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( fp8_type* __restrict__ token_output = &out[offset]; // For vectorization, token_input and token_output pointers need to be - // aligned at 8-byte and 4-byte addresses respectively. 
- bool const can_vectorize = hidden_size % 4 == 0; + // aligned at 32-byte and 16-byte addresses respectively. + bool const can_vectorize = hidden_size % 16 == 0; float absmax_val = 0.0f; if (can_vectorize) { @@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( } else { for (int i = tid; i < hidden_size; i += blockDim.x) { float const x = static_cast(token_input[i]); - absmax_val = max(absmax_val, fabs(x)); + absmax_val = fmaxf(absmax_val, fabsf(x)); } } - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStorage; float const block_absmax_val_maybe = BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); __shared__ float token_scale; if (tid == 0) { if (scale_ub) { - token_scale = min(block_absmax_val_maybe, *scale_ub); + token_scale = fminf(block_absmax_val_maybe, *scale_ub); } else { token_scale = block_absmax_val_maybe; } // token scale computation - token_scale = max(token_scale / quant_type_max_v, - min_scaling_factor::val()); + token_scale = fmaxf(token_scale / quant_type_max_v, + min_scaling_factor::val()); scale[token_idx] = token_scale; } __syncthreads(); @@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor const& scale) // [1] { - int64_t num_tokens = input.numel() / input.size(-1); - int64_t num_elems = input.numel(); - dim3 grid(num_tokens); - dim3 block(1024); + int const block_size = 256; + int const num_tokens = input.numel() / input.size(-1); + int const num_elems = input.numel(); + dim3 const grid(num_tokens); + dim3 const block(block_size); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( @@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scale) // [1] { 
- int64_t num_tokens = input.numel() / input.size(-1); - int64_t num_elems = input.numel(); - dim3 grid(num_tokens); - dim3 block(1024); + int const block_size = 256; + int const num_tokens = input.numel() / input.size(-1); + int const num_elems = input.numel(); + dim3 const grid(num_tokens); + dim3 const block(block_size); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( @@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant( int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; + int const block_size = 256; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, block_size)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index def8b31b27546..d36f94a8f10d6 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, } float r = - fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); + fmaxf(-quant_type_max_v, fminf(x, quant_type_max_v)); #ifndef USE_ROCM return static_cast(r); #else @@ -65,7 +65,7 @@ template __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { - __shared__ float cache[1024]; + __shared__ float cache[256]; int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by @@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, scalar_t tmp = 0.0; while (i < num_elems) { float x = static_cast(input[i]); - tmp = max(tmp, fabs(x)); + tmp = fmaxf(tmp, fabsf(x)); i += blockDim.x * gridDim.x; } cache[threadIdx.x] = tmp; @@ -100,25 
+100,27 @@ template __device__ float thread_max_vec(scalar_t const* __restrict__ input, int64_t const num_elems, int const tid, int const step) { + constexpr size_t VEC_SIZE = 16; + using scalarxN_t = vec_n_t; // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); + auto const* vectorized_in = reinterpret_cast(input); - int64_t const num_vec_elems = num_elems >> 2; + // num_elems / VEC_SIZE (which is 16) + int64_t const num_vec_elems = num_elems >> 4; float absmax_val = 0.0f; -#pragma unroll 4 +#pragma unroll for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - absmax_val = max(absmax_val, fabs(in_vec.x)); - absmax_val = max(absmax_val, fabs(in_vec.y)); - absmax_val = max(absmax_val, fabs(in_vec.z)); - absmax_val = max(absmax_val, fabs(in_vec.w)); + scalarxN_t in_vec = vectorized_in[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j])); + } } - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - absmax_val = max(absmax_val, fabs(input[i])); + // Handle the remaining elements if num_elems is not divisible by VEC_SIZE + for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { + absmax_val = fmaxf(absmax_val, fabsf(input[i])); } return absmax_val; @@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out, float const scale, int64_t const num_elems, int const tid, int const step) { - using float8x4_t = q8x4_t; + constexpr size_t VEC_SIZE = 16; + using scalarxN_t = vec_n_t; + using float8xN_t = q8_n_t; // Vectorized input/output to better utilize memory bandwidth. 
- auto const* vectorized_in = reinterpret_cast const*>(input); - auto* vectorized_out = reinterpret_cast(out); + auto const* vectorized_in = reinterpret_cast(input); + auto* vectorized_out = reinterpret_cast(out); - int64_t const num_vec_elems = num_elems >> 2; + // num_elems / VEC_SIZE (which is 16) + int64_t const num_vec_elems = num_elems >> 4; -#pragma unroll 4 +#pragma unroll for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - float8x4_t out_vec; + scalarxN_t in_vec = vectorized_in[i]; + float8xN_t out_vec; - out_vec.x = scaled_fp8_conversion( - static_cast(in_vec.x), scale); - out_vec.y = scaled_fp8_conversion( - static_cast(in_vec.y), scale); - out_vec.z = scaled_fp8_conversion( - static_cast(in_vec.z), scale); - out_vec.w = scaled_fp8_conversion( - static_cast(in_vec.w), scale); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + out_vec.val[j] = scaled_fp8_conversion( + static_cast(in_vec.val[j]), scale); + } vectorized_out[i] = out_vec; } - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + // Handle the remaining elements if num_elems is not divisible by VEC_SIZE + for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { out[i] = scaled_fp8_conversion( static_cast(input[i]), scale); } diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index e6d23cd24e178..3f188872d80d3 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, // sum of squares float ss = 0.0f; + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; #pragma unroll 4 @@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, vec4_t in = vec_input[i]; 
vec4_t x; - x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); - if constexpr (has_residual) { - vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); } - ss += x.x * x.x; - ss += x.y * x.y; - ss += x.z * x.z; - ss += x.w * x.w; + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } + } + +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + ss += x.val[j] * x.val[j]; + } } using BlockReduce = cub::BlockReduce; @@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales( constexpr scalar_out_t qmax{quant_type_max_v}; + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; float block_absmax_val_maybe = 0.0f; @@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales( vec4_t const w = vec_weight[i]; vec4_t x; - x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); - if constexpr (has_residual) { - vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); } - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.x * rms) * w.x)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.y * rms) * w.y)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.z * rms) * w.z)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.w * rms) * w.w)); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += 
static_cast(r.val[j]); + } + } + +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + block_absmax_val_maybe = + fmaxf(block_absmax_val_maybe, + fabs(static_cast(x.val[j] * rms) * w.val[j])); + } } using BlockReduce = cub::BlockReduce; @@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, vec_residual = reinterpret_cast*>(&residual[token_offset]); } + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; // TODO(luka/varun) extract into type-agnostic vectorized quant function to @@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, vec4_t const w = vec_weight[i]; vec4_t x; - x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); + } + if constexpr (has_residual) { vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); - // Update residual - r.x = static_cast(x.x); - r.y = static_cast(x.y); - r.z = static_cast(x.z); - r.w = static_cast(x.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } +// Update residual +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + r.val[j] = static_cast(x.val[j]); + } vec_residual[i] = r; } q8x4_t out; - out.x = ScaledQuant::quant_fn( - static_cast(x.x * rms) * w.x, scale); - out.y = ScaledQuant::quant_fn( - static_cast(x.y * rms) * w.y, scale); - out.z = ScaledQuant::quant_fn( - static_cast(x.z * rms) * w.z, scale); - out.w = ScaledQuant::quant_fn( - static_cast(x.w * rms) * w.w, scale); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + out.val[j] = ScaledQuant::quant_fn( + static_cast(x.val[j] * rms) * w.val[j], scale); + } vec_output[i] = out; } } diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh index 866da10b5bc14..11d57a5fafe89 
100644 --- a/csrc/quantization/vectorization.cuh +++ b/csrc/quantization/vectorization.cuh @@ -10,23 +10,22 @@ namespace vllm { // Vectorization containers -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; +template +struct __align__(vec_size * sizeof(scalar_t)) vec_n_t { + scalar_t val[vec_size]; }; -template -struct __align__(4) q8x4_t { +template +struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t { static_assert(std::is_same_v || std::is_same_v || std::is_same_v); - quant_type_t x; - quant_type_t y; - quant_type_t z; - quant_type_t w; + quant_type_t val[vec_size]; }; +template +using vec4_t = vec_n_t; +template +using q8x4_t = q8_n_t; + } // namespace vllm From 6865fe0074771ed56c1cb2eca047a8e74ab53ce9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 3 Jun 2025 22:07:19 +0100 Subject: [PATCH 031/115] Fix interaction between `Optional` and `Annotated` in CLI typing (#19093) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Yikun Jiang --- tests/engine/test_arg_utils.py | 18 +++++++++++++++--- vllm/engine/arg_utils.py | 26 +++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index ab78aa7da21bd..cfbc7c245ffd4 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,14 +5,14 @@ import json from argparse import ArgumentError, ArgumentTypeError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Annotated, Literal, Optional import pytest from vllm.config import CompilationConfig, config from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, - get_type, is_not_builtin, is_type, - literal_to_kwargs, nullable_kvs, + get_type, get_type_hints, is_not_builtin, + is_type, literal_to_kwargs, nullable_kvs, 
optional_type, parse_type) from vllm.utils import FlexibleArgumentParser @@ -160,6 +160,18 @@ def test_is_not_builtin(type_hint, expected): assert is_not_builtin(type_hint) == expected +@pytest.mark.parametrize( + ("type_hint", "expected"), [ + (Annotated[int, "annotation"], {int}), + (Optional[int], {int, type(None)}), + (Annotated[Optional[int], "annotation"], {int, type(None)}), + (Optional[Annotated[int, "annotation"]], {int, type(None)}), + ], + ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"]) +def test_get_type_hints(type_hint, expected): + assert get_type_hints(type_hint) == expected + + def test_get_kwargs(): kwargs = get_kwargs(DummyConfig) print(kwargs) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 587a23134fe90..2197d44ca8259 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional, import regex as re import torch -from pydantic import SkipValidation, TypeAdapter, ValidationError +from pydantic import TypeAdapter, ValidationError from typing_extensions import TypeIs, deprecated import vllm.envs as envs @@ -151,17 +151,29 @@ def is_not_builtin(type_hint: TypeHint) -> bool: return type_hint.__module__ != "builtins" +def get_type_hints(type_hint: TypeHint) -> set[TypeHint]: + """Extract type hints from Annotated or Union type hints.""" + type_hints: set[TypeHint] = set() + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is Annotated: + type_hints.update(get_type_hints(args[0])) + elif origin is Union: + for arg in args: + type_hints.update(get_type_hints(arg)) + else: + type_hints.add(type_hint) + + return type_hints + + def get_kwargs(cls: ConfigType) -> dict[str, Any]: cls_docs = get_attr_docs(cls) kwargs = {} for field in fields(cls): # Get the set of possible types for the field - type_hints: set[TypeHint] = set() - if get_origin(field.type) in {Union, Annotated}: - 
predicate = lambda arg: not isinstance(arg, SkipValidation) - type_hints.update(filter(predicate, get_args(field.type))) - else: - type_hints.add(field.type) + type_hints: set[TypeHint] = get_type_hints(field.type) # If the field is a dataclass, we can use the model_validate_json generator = (th for th in type_hints if is_dataclass(th)) From 6cac54f4d1673991a415b9897d610c132104155b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 05:41:36 +0800 Subject: [PATCH 032/115] [v1] Re-init input batch for multiple kv cache groups (#18654) Signed-off-by: Chen Zhang --- tests/v1/worker/test_gpu_input_batch.py | 29 ++------------- tests/v1/worker/test_gpu_model_runner.py | 4 ++- vllm/v1/worker/block_table.py | 3 +- vllm/v1/worker/gpu_input_batch.py | 18 +++++----- vllm/v1/worker/gpu_model_runner.py | 46 ++++++++++++++++++++---- vllm/v1/worker/tpu_model_runner.py | 7 ++-- 6 files changed, 61 insertions(+), 46 deletions(-) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index e932e4b323498..72547e86b0e93 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -10,8 +10,6 @@ import torch from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -25,27 +23,6 @@ CUDA_DEVICES = [ MAX_NUM_PROMPT_TOKENS = 64 -def get_kv_cache_config() -> KVCacheConfig: - return KVCacheConfig( - num_blocks=10, - tensors={ - "layer.0": KVCacheTensor(size=1024), - }, - kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=1, - num_kv_heads=1, - head_size=16, - 
dtype=torch.float16, - use_mla=False, - ), - ), - ], - ) - - def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names = set([ @@ -252,7 +229,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -342,7 +319,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, @@ -351,7 +328,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 5e2fd2fbf747b..0553d94de4c22 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -54,7 +54,9 @@ def initialize_kv_cache(runner: GPUModelRunner): device=runner.device, pin_memory=runner.pin_memory, vocab_size=runner.model_config.get_vocab_size(), - block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size, + block_sizes=[ + kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size + ], ) runner.initialize_attn_backend(kv_cache_config) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 958262c492462..5cd5674fb5220 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -105,10 +105,11 @@ class MultiGroupBlockTable: def __init__(self, max_num_reqs: int, max_model_len: int, max_num_batched_tokens: int, pin_memory: bool, - device: torch.device, block_size: int) -> None: + device: torch.device, 
block_sizes: list[int]) -> None: self.block_tables = [ BlockTable(max_num_reqs, cdiv(max_model_len, block_size), max_num_batched_tokens, pin_memory, device) + for block_size in block_sizes ] def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bb986b6047f65..34737029f6bf3 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -56,14 +56,14 @@ class CachedRequestState: class InputBatch: def __init__( - self, - max_num_reqs: int, - max_model_len: int, - max_num_batched_tokens: int, - device: torch.device, - pin_memory: bool, - vocab_size: int, - block_size: int, + self, + max_num_reqs: int, + max_model_len: int, + max_num_batched_tokens: int, + device: torch.device, + pin_memory: bool, + vocab_size: int, + block_sizes: list[int], # The block_size of each kv cache group ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -105,7 +105,7 @@ class InputBatch: max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, - block_size=block_size, + block_sizes=block_sizes, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b7448be26f107..6a566a602b190 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -143,7 +143,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.input_batch: InputBatch # Persistent batch. # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -173,6 +172,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Request states. 
self.requests: dict[str, CachedRequestState] = {} + # Input Batch + # NOTE(Chen): Ideally, we should initialize the input batch inside + # `initialize_kv_cache` based on the kv cache config. However, as in + # https://github.com/vllm-project/vllm/pull/18298, due to some unknown + # reasons, we have to initialize the input batch before `load_model`, + # quantization + weight offloading will fail otherwise. As a temporary + # solution, we initialize the input batch here, and re-initialize it + # in `initialize_kv_cache` if the block_sizes here is different from + # the block_sizes in the kv cache config. self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, max_model_len=self.max_model_len, @@ -180,7 +188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, + block_sizes=[self.cache_config.block_size], ) self.use_cuda_graph = (self.vllm_config.compilation_config.level @@ -2040,6 +2048,35 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.attn_backends.append(attn_backend_i) self.attn_metadata_builders.append(attn_metadata_builder_i) + def may_reinitialize_input_batch(self, + kv_cache_config: KVCacheConfig) -> None: + """ + Re-initialize the input batch if the block sizes are different from + `[self.cache_config.block_size]`. This usually happens when there + are multiple KV cache groups. + + Args: + kv_cache_config: The KV cache configuration. + """ + block_sizes = [ + kv_cache_group.kv_cache_spec.block_size + for kv_cache_group in kv_cache_config.kv_cache_groups + ] + if block_sizes != [self.cache_config.block_size]: + assert self.cache_config.cpu_offload_gb == 0, ( + "Cannot re-initialize the input batch when CPU weight " + "offloading is enabled. 
See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501 + "for more details.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=block_sizes, + ) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -2047,11 +2084,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.kv_cache_groups) > 1: - raise NotImplementedError( - "Hybrid models with more than one KV cache type are not " - "supported yet.") self.kv_cache_config = kv_cache_config + self.may_reinitialize_input_batch(kv_cache_config) self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index f15234f49ce05..73c445d14e38e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -200,7 +200,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=self.block_size, + block_sizes=[self.block_size], ) # Cached torch/numpy tensor @@ -1358,8 +1358,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. 
- block_size, + block_sizes=[ + kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size + ], ) # Verify dtype compatibility between block_table_cpu and input_batch assert self.block_table_cpu.dtype == self.input_batch.block_table[ From 135cf55cd1d83cd4e18266e343a59e6d9f87856f Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 3 Jun 2025 18:26:33 -0400 Subject: [PATCH 033/115] [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) --- benchmarks/benchmark_dataset.py | 10 +++++++++- vllm/benchmarks/datasets.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 80a9246aa0b79..5d2a26cd443c0 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -865,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 0ef3e0254cc4f..f795a12568e05 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -880,7 +880,19 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." 
+ + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( From b5fd9506c14bed640210a7f3d0adb03a024afdbe Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 06:30:55 +0800 Subject: [PATCH 034/115] [Bugfix] get_num_blocks_to_allocate with null_block (#19031) Signed-off-by: Chen Zhang --- tests/v1/core/test_specialized_manager.py | 23 ++++++++++++++++++++ vllm/v1/core/block_pool.py | 5 +++-- vllm/v1/core/kv_cache_utils.py | 3 +++ vllm/v1/core/single_type_kv_cache_manager.py | 5 +++-- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index c6f7481ddde32..92ce8ea8b8dd7 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -144,3 +144,26 @@ def test_sliding_window_remove_skipped_blocks(): # of removed blocks should be [1003, 1002]. 
manager.remove_skipped_blocks("test", 11) assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:]) + + +def test_get_num_blocks_to_allocate(): + block_size = 2 + sliding_window_spec = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + sliding_window=4, # Placeholder value, not related to test result + use_mla=False, + ) + + block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + manager = get_sliding_window_manager(sliding_window_spec, block_pool) + cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] + cached_blocks_2 = [block_pool.null_block for _ in range(5) + ] + [KVCacheBlock(i + 1) for i in range(5)] + + assert manager.get_num_blocks_to_allocate("1", 20 * block_size, + cached_blocks_1) == 20 + assert manager.get_num_blocks_to_allocate("2", 20 * block_size, + cached_blocks_2) == 15 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 27eaca49797d8..5118e4d8e6147 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -63,6 +63,7 @@ class BlockPool: # The ref_cnt of null_block is not maintained, needs special care to # avoid freeing it. self.null_block = self.free_block_queue.popleft() + self.null_block.is_null = True self.enable_kv_cache_events = enable_kv_cache_events self.kv_event_queue: list[KVCacheEvent] = [] @@ -252,7 +253,7 @@ class BlockPool: for block in blocks: # ref_cnt=0 means this block is in the free list (i.e. eviction # candidate), so remove it. - if block.ref_cnt == 0 and block != self.null_block: + if block.ref_cnt == 0 and not block.is_null: self.free_block_queue.remove(block) block.incr_ref() @@ -267,7 +268,7 @@ class BlockPool: for block in ordered_blocks: block.decr_ref() # null_block should not be added to the free list. 
- if block.ref_cnt == 0 and block != self.null_block: + if block.ref_cnt == 0 and not block.is_null: self.free_block_queue.append(block) def reset_prefix_cache(self) -> bool: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 61476362e3024..3b5a379267e5a 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -125,6 +125,9 @@ class KVCacheBlock: prev_free_block: Optional["KVCacheBlock"] = None next_free_block: Optional["KVCacheBlock"] = None + # Whether the block is a null block that should never be cached. + is_null: bool = False + def incr_ref(self): self.ref_cnt += 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 233c73e882398..a529cde097f5b 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -83,8 +83,9 @@ class SingleTypeKVCacheManager(ABC): # free queue and ref_cnt == 0), it will be changed from a free block # to a computed block when the request is allocated, so we also count # it as needed to be allocated. 
- num_evictable_computed_blocks = sum(blk.ref_cnt == 0 - for blk in new_computed_blocks) + num_evictable_computed_blocks = sum( + blk.ref_cnt == 0 and not blk.is_null + for blk in new_computed_blocks) return ((num_new_blocks + num_evictable_computed_blocks) * self.num_kv_cache_groups) From 4de790fcad85abb0969da18bc9125889407c432a Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 4 Jun 2025 07:27:24 +0800 Subject: [PATCH 035/115] [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075) Signed-off-by: chaunceyjiang --- .../test_completion_with_function_calling.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index dbea2dc0b0782..5c1f07832c2e9 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -9,7 +9,7 @@ import pytest_asyncio from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e514d660be41..777b7f5bcde5a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -320,10 +320,13 @@ class OpenAIServingChat(OpenAIServing): def extract_tool_call_required_streaming( self, previous_text: str, - current_text: str, + current_text: Optional[str], delta_text: str, function_name_returned: bool, ) -> tuple[Optional[DeltaMessage], bool]: + if current_text is None or current_text == "": + # if the current text is empty, we cannot parse it + return None, function_name_returned try: obj = partial_json_parser.loads(current_text) 
except partial_json_parser.core.exceptions.MalformedJSON: @@ -650,10 +653,18 @@ class OpenAIServingChat(OpenAIServing): current_text = previous_text + delta_text fn_name_returned = function_name_returned[i] + if self.reasoning_parser: + _, content = \ + reasoning_parser.extract_reasoning_content( + current_text, + request + ) + else: + content = current_text delta_message, function_name_returned[i] = ( self.extract_tool_call_required_streaming( previous_text=previous_text, - current_text=current_text, + current_text=content, delta_text=delta_text, function_name_returned=fn_name_returned)) @@ -981,8 +992,9 @@ class OpenAIServingChat(OpenAIServing): # the fields of FunctionDefinition are a superset of the # tool call outputs and can be used for parsing + assert content is not None tool_calls = TypeAdapter( - list[FunctionDefinition]).validate_json(output.text) + list[FunctionDefinition]).validate_json(content) message = ChatMessage( role=role, content="", From 5d96533e2235c37e64ef381fafa244db197b25dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 4 Jun 2025 01:53:16 +0200 Subject: [PATCH 036/115] [Bugfix][P/D] Fix Prefix Cache Bug (#18411) Signed-off-by: nicklucche Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 3f0b0e2952196..fd22280126d62 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -739,7 +739,8 @@ class NixlConnectorWorker: # just notify P worker that we have the blocks we need. 
num_local_blocks = len(local_block_ids) if num_local_blocks == 0: - self.nixl_wrapper.send_notif(dst_engine_id, + agent_name = self._remote_agents[dst_engine_id] + self.nixl_wrapper.send_notif(agent_name, notif_msg=request_id.encode("utf-8")) return From a8da78eac92b5e79947a6fdd51bec0d1e5cea0a7 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 08:14:06 +0800 Subject: [PATCH 037/115] [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 90 +++++++++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 61 +++++++++++++------ 2 files changed, 125 insertions(+), 26 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ad34becb1e8db..71ea43383a7e4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -12,13 +12,11 @@ from vllm.utils import GiB_bytes, sha256 from vllm.v1.core.kv_cache_manager import KVCacheManager # disable yapf here as it formats differently than isort such that both fail # yapf: disable -from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock, - PrefixCachingMetrics, - estimate_max_model_len, - generate_block_hash_extra_keys, - hash_block_tokens, - hash_request_tokens, - unify_kv_cache_configs) +from vllm.v1.core.kv_cache_utils import ( + FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, + estimate_max_model_len, generate_block_hash_extra_keys, + get_max_concurrency_for_kv_cache_config, hash_block_tokens, + hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, SlidingWindowSpec) @@ -597,6 +595,84 @@ def test_estimate_max_model_len(model_id, max_model_len, assert estimated_max_len == want_estimated_max_len +def test_get_max_concurrency_for_kv_cache_config(): + # Create a VllmConfig + model_id 
= "Qwen/Qwen1.5-7B" + max_model_len = 16384 + model_config = ModelConfig( + model_id, + task="generate", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + max_model_len=max_model_len, + ) + scheduler_config = SchedulerConfig(max_num_batched_tokens=1024, + enable_chunked_prefill=True) + + vllm_config = VllmConfig( + model_config=model_config, + scheduler_config=scheduler_config, + ) + + full_attention_spec = FullAttentionSpec( + block_size=16, + num_kv_heads=32, + head_size=128, + dtype=torch.float16, + use_mla=False, + ) + + sliding_window_spec = SlidingWindowSpec( + block_size=16, + num_kv_heads=32, + head_size=128, + dtype=torch.float16, + use_mla=False, + sliding_window=1024, + ) + + kv_cache_config_full_attention = KVCacheConfig( + num_blocks=int(1024 * 1.5), + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + full_attention_spec), + ], + ) + max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_full_attention) + assert max_concurrency_full_attention == 1.5 + + kv_cache_config_sliding_window = KVCacheConfig( + num_blocks=129 * 3, + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + sliding_window_spec), + ], + ) + max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_sliding_window) + assert max_concurrency_sliding_window == 3 + + kv_cache_config_hybrid_model = KVCacheConfig( + num_blocks=(1024 + 129) * 3, + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + full_attention_spec), + KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)], + sliding_window_spec), + ], + ) + max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_hybrid_model) + assert max_concurrency_hybrid_model == 3 + + def test_allocate_with_lookahead(): """Verify that lookahead 
tokens correctly affect block allocation""" block_size = 4 diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3b5a379267e5a..ad3c21f794b94 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,13 +3,13 @@ """KV-Cache Utilities.""" import os from collections import deque -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass from typing import Any, Callable, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import GiB_bytes, sha256 +from vllm.utils import GiB_bytes, cdiv, sha256 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor, SlidingWindowSpec) @@ -468,6 +468,15 @@ def hash_request_tokens(hash_function: Any, block_size: int, return ret +def max_memory_usage_bytes(vllm_config: VllmConfig, + kv_cache_specs: Iterable[KVCacheSpec]) -> int: + """ + Get the maximum memory usage in bytes for the given KV cache specs. 
+ """ + return sum( + spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs) + + def estimate_max_model_len(vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], available_memory: int) -> int: @@ -489,11 +498,8 @@ def estimate_max_model_len(vllm_config: VllmConfig, # Modify the max_model_len for this calculation vllm_config.model_config.max_model_len = model_len # Calculate memory needed for the given model length - memory_needed = sum( - (layer_spec.max_memory_usage_bytes(vllm_config) - for layer_spec in kv_cache_spec.values()), - start=0, - ) + memory_needed = max_memory_usage_bytes(vllm_config, + kv_cache_spec.values()) return memory_needed <= available_memory # Binary search for the maximum model length @@ -538,9 +544,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, "initializing the engine.") max_model_len = vllm_config.model_config.max_model_len - needed_memory = 0 - for layer_spec in kv_cache_spec.values(): - needed_memory += layer_spec.max_memory_usage_bytes(vllm_config) + needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values()) if needed_memory > available_memory: # Estimate the maximum model length that can fit in the available memory @@ -606,6 +610,24 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: return len(layer_keys) == 1 +def get_max_concurrency_for_kv_cache_config( + vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float: + """ + Get the maximum concurrency for the given KV cache configuration. 
+ """ + num_layer_per_group = max( + len(group.layer_names) for group in kv_cache_config.kv_cache_groups) + max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes( + vllm_config, + (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups)) + memory_per_block = kv_cache_config.kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes * num_layer_per_group + num_block_per_request = cdiv(max_memory_usage_per_request, + memory_per_block) + max_concurrency = kv_cache_config.num_blocks / num_block_per_request + return max_concurrency + + def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], available_memory: int) -> KVCacheConfig: @@ -637,14 +659,6 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) num_blocks = num_gpu_blocks_override - num_tokens = num_blocks * vllm_config.cache_config.block_size - num_tokens_str = f"{num_tokens:,}" - logger.info("GPU KV cache size: %s tokens", num_tokens_str) - max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" - max_concurrency = num_tokens / vllm_config.model_config.max_model_len - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - max_model_len_str, max_concurrency) - per_layer_size = page_size * num_blocks # All layers have the same KV cache spec, so we create one kv cache group # for all layers. 
@@ -659,6 +673,15 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec, grouped_layer_names), ) + + num_tokens = num_blocks * vllm_config.cache_config.block_size + num_tokens_str = f"{num_tokens:,}" + logger.info("GPU KV cache size: %s tokens", num_tokens_str) + max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" + max_concurrency = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + max_model_len_str, max_concurrency) return kv_cache_config @@ -705,8 +728,8 @@ def get_kv_cache_config(vllm_config: VllmConfig, Returns: The generated KVCacheConfigs """ - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) unify_hybrid_kv_cache_specs(kv_cache_spec) + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) if is_kv_cache_type_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. 
Allocate the same amount of memory for From b712be98c790794479030313f2c2b9dae17ea7de Mon Sep 17 00:00:00 2001 From: Yan Ru Pei Date: Tue, 3 Jun 2025 17:14:20 -0700 Subject: [PATCH 038/115] feat: add data parallel rank to KVEventBatch (#18925) --- .buildkite/test-pipeline.yaml | 2 + tests/distributed/conftest.py | 101 ++++++----- tests/distributed/test_events.py | 69 +++++++- tests/v1/engine/test_engine_core_client.py | 189 +++++++++++++++++---- vllm/distributed/kv_events.py | 77 ++++++++- vllm/v1/core/sched/scheduler.py | 4 +- 6 files changed, 359 insertions(+), 83 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fb8ceaace05d..8ab96b3b7ac3c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -145,6 +145,7 @@ steps: - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py + - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -154,6 +155,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 95f085788b856..666a715cc0da1 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -13,11 +13,13 @@ from vllm.distributed.kv_events import EventPublisherFactory from .test_events import SampleBatch +DP_RANK = 0 + @pytest.fixture def random_port(): """Generate a random port number for testing""" - return random.randint(10000, 60000) + return random.randint(10000, 59900) @pytest.fixture @@ -30,21 +32,23 @@ 
def publisher_config(random_port, request): replay_endpoint = endpoint + "-replay" else: endpoint = f"tcp://*:{random_port}" - replay_endpoint = f"tcp://*:{random_port + 1}" + replay_endpoint = f"tcp://*:{random_port + 100}" - return KVEventsConfig(enable_kv_cache_events=True, - publisher="zmq", - endpoint=endpoint, - replay_endpoint=replay_endpoint, - buffer_steps=100, - hwm=1000, - topic="test") + return KVEventsConfig( + enable_kv_cache_events=True, + publisher="zmq", + endpoint=endpoint, + replay_endpoint=replay_endpoint, + buffer_steps=100, + hwm=1000, + topic="test", + ) @pytest.fixture def publisher(publisher_config): """Create and return a publisher instance""" - pub = EventPublisherFactory.create(publisher_config) + pub = EventPublisherFactory.create(publisher_config, DP_RANK) yield pub pub.shutdown() @@ -60,7 +64,11 @@ def subscriber(publisher_config): if replay_endpoint and replay_endpoint.startswith("tcp://*"): replay_endpoint = replay_endpoint.replace("*", "127.0.0.1") - sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic) + sub = MockSubscriber( + [endpoint], + [replay_endpoint] if replay_endpoint else None, + publisher_config.topic, + ) yield sub sub.close() @@ -68,26 +76,37 @@ def subscriber(publisher_config): class MockSubscriber: """Helper class to receive and verify published events""" - def __init__(self, - pub_endpoint: str, - replay_endpoint: Optional[str] = None, - topic: str = "", - decode_type=SampleBatch): + def __init__( + self, + pub_endpoints: Union[str, list[str]], + replay_endpoints: Optional[Union[str, list[str]]] = None, + topic: str = "", + decode_type=SampleBatch, + ): self.ctx = zmq.Context.instance() - # Set up subscriber socket - self.sub = self.ctx.socket(zmq.SUB) - self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8')) - self.sub.connect(pub_endpoint) + # Convert single endpoint to list for consistency + if isinstance(pub_endpoints, str): + pub_endpoints = [pub_endpoints] + if 
isinstance(replay_endpoints, str): + replay_endpoints = [replay_endpoints] - # Set up replay socket if provided - self.replay = None - if replay_endpoint: - self.replay = self.ctx.socket(zmq.REQ) - self.replay.connect(replay_endpoint) + # Set up subscriber socket - connect to all endpoints + self.sub = self.ctx.socket(zmq.SUB) + self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8")) + for endpoint in pub_endpoints: + self.sub.connect(endpoint) + + # Set up replay sockets if provided + self.replay_sockets = [] + if replay_endpoints: + for replay_endpoint in replay_endpoints: + replay = self.ctx.socket(zmq.REQ) + replay.connect(replay_endpoint) + self.replay_sockets.append(replay) self.topic = topic - self.topic_bytes = topic.encode('utf-8') + self.topic_bytes = topic.encode("utf-8") self.received_msgs: list[tuple[int, SampleBatch]] = [] self.last_seq = -1 self.decoder = msgspec.msgpack.Decoder(type=decode_type) @@ -107,25 +126,31 @@ class MockSubscriber: self.received_msgs.append((seq, data)) return seq, data - def request_replay(self, start_seq: int) -> None: + def request_replay(self, start_seq: int, socket_idx: int = 0) -> None: """Request replay of messages starting from start_seq""" - if not self.replay: - raise ValueError("Replay socket not initialized") + if not self.replay_sockets: + raise ValueError("Replay sockets not initialized") + if socket_idx >= len(self.replay_sockets): + raise ValueError(f"Invalid socket index {socket_idx}") - self.replay.send(start_seq.to_bytes(8, "big")) + self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big")) - def receive_replay(self) -> list[tuple[int, SampleBatch]]: - """Receive replayed messages""" - if not self.replay: - raise ValueError("Replay socket not initialized") + def receive_replay(self, + socket_idx: int = 0) -> list[tuple[int, SampleBatch]]: + """Receive replayed messages from a specific replay socket""" + if not self.replay_sockets: + raise ValueError("Replay sockets not initialized") + if 
socket_idx >= len(self.replay_sockets): + raise ValueError(f"Invalid socket index {socket_idx}") + replay_socket = self.replay_sockets[socket_idx] replayed: list[tuple[int, SampleBatch]] = [] while True: try: - if not self.replay.poll(1000): + if not replay_socket.poll(1000): break - frames = self.replay.recv_multipart() + frames = replay_socket.recv_multipart() if not frames or not frames[-1]: # End of replay marker break @@ -142,5 +167,5 @@ class MockSubscriber: def close(self): """Clean up resources""" self.sub.close() - if self.replay: - self.replay.close() + for replay in self.replay_sockets: + replay.close() diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index ec1e5a2d62f11..8be9ee0a1889d 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -9,6 +9,8 @@ import pytest from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory, NullEventPublisher) +DP_RANK = 0 + class EventSample( msgspec.Struct, @@ -121,7 +123,7 @@ def test_topic_filtering(publisher_config): publisher_config.replay_endpoint = None publisher_config.topic = "foo" - pub = EventPublisherFactory.create(publisher_config) + pub = EventPublisherFactory.create(publisher_config, DP_RANK) from .conftest import MockSubscriber sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo") @@ -185,9 +187,72 @@ def test_high_volume(publisher, subscriber): def test_null_publisher(): """Test that NullEventPublisher can be used without errors""" - publisher = NullEventPublisher() + publisher = NullEventPublisher(DP_RANK) # This should not raise any errors batch = create_test_events(5) publisher.publish(batch) publisher.shutdown() + + +def test_data_parallel_rank_tagging(publisher_config): + """Test that events are properly tagged with their data parallel rank""" + + publisher_config.topic = "foo" + pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK) + pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 
1) + + # Hardcode the expected endpoints based on port offsetting behavior + # Both ranks get offsets according to _offset_endpoint_port function + base_endpoint = publisher_config.endpoint + if "tcp://" in base_endpoint: + # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558 + expected_endpoint_0 = base_endpoint # rank 0 gets port + 0 = same port + expected_endpoint_1 = base_endpoint.replace( + ":5557", ":5558") # rank 1 gets port + 1 + else: + # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1 + expected_endpoint_0 = base_endpoint # rank 0 gets base + expected_endpoint_1 = base_endpoint + "_dp1" # rank 1 gets _dp1 + + from .conftest import MockSubscriber + sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic) + sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic) + + try: + time.sleep(0.1) # Let publishers start up + + # Publish events from different ranks + batch_0 = create_test_events(2) + batch_1 = create_test_events(3) + + pub_0.publish(batch_0) + pub_1.publish(batch_1) + + # Receive events from rank 0 + result_0 = sub_0.receive_one(timeout=200) + assert result_0 is not None, "No message received from rank 0" + seq_0, received_0 = result_0 + + # Receive events from rank 1 + result_1 = sub_1.receive_one(timeout=200) + assert result_1 is not None, "No message received from rank 1" + seq_1, received_1 = result_1 + + # Verify DP rank tagging + assert received_0.data_parallel_rank == 0, ( + f"Expected DP rank 0, got {received_0.data_parallel_rank}") + assert received_1.data_parallel_rank == 1, ( + f"Expected DP rank 1, got {received_1.data_parallel_rank}") + + # Verify event content is correct + assert len( + received_0.events) == 2, "Wrong number of events from rank 0" + assert len( + received_1.events) == 3, "Wrong number of events from rank 1" + + finally: + pub_0.shutdown() + pub_1.shutdown() + sub_0.close() + sub_1.close() diff --git 
a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index a01b205dfaed5..47181d36f4ccc 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -12,8 +12,10 @@ from typing import Optional import pytest from transformers import AutoTokenizer +from tests.utils import multi_gpu_test from vllm import SamplingParams -from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.distributed.kv_events import (BlockStored, KVEventBatch, + ZmqEventPublisher) from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext @@ -37,10 +39,15 @@ PROMPT = "Hello my name is Robert and I love quantization kernels" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request(params: SamplingParams) -> EngineCoreRequest: +def make_request( + params: SamplingParams, + prompt_tokens_ids: Optional[list[int]] = None) -> EngineCoreRequest: + if not prompt_tokens_ids: + prompt_tokens_ids = PROMPT_TOKENS + return EngineCoreRequest( request_id=str(uuid.uuid4()), - prompt_token_ids=PROMPT_TOKENS, + prompt_token_ids=prompt_tokens_ids, mm_inputs=None, mm_hashes=None, mm_placeholders=None, @@ -88,6 +95,25 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: dict): break +async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict): + + while True: + engine_core_outputs = (await client.get_output_async()).outputs + + if len(engine_core_outputs) == 0: + continue + + # Add outputs to the dict + for out in engine_core_outputs: + outputs[out.request_id].append(out) + + # Check if all request IDs in outputs have finished + if all(outs and outs[-1].finished for outs in outputs.values()): + break + + await asyncio.sleep(0.1) + + # Dummy utility function to monkey-patch into engine core. 
def echo(self, msg: str, err_msg: Optional[str] = None) -> str: print(f"echo util function called: {msg}, {err_msg}") @@ -273,10 +299,12 @@ def test_kv_cache_events( block_size = 16 num_blocks = 2 - engine_args = EngineArgs(model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - block_size=block_size) + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + block_size=block_size, + ) engine_args.kv_events_config = publisher_config vllm_config = engine_args.create_engine_config( @@ -297,19 +325,8 @@ def test_kv_cache_events( try: custom_tokens = list(range(num_blocks * block_size)) - request = EngineCoreRequest( - request_id=str(uuid.uuid4()), - prompt_token_ids=custom_tokens, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - sampling_params=SamplingParams( - max_tokens=1), # Short completion for speed - eos_token_id=None, - arrival_time=time.time(), - lora_request=None, - cache_salt=None, - ) + sampling_params = SamplingParams(max_tokens=1) + request = make_request(sampling_params, custom_tokens) client.add_request(request) outputs: dict[str, list] = {request.request_id: []} @@ -321,24 +338,130 @@ def test_kv_cache_events( seq, received = result assert seq == 0, "Sequence number mismatch" - assert len(received.events) == 1, ( - "We should have exactly one BlockStored event") + assert (len(received.events) == 1 + ), "We should have exactly one BlockStored event" event = received.events[0] assert isinstance( - event, BlockStored), ("We should have a BlockStored event") - assert len(event.block_hashes) == num_blocks, ( - "We should have a BlockStored event with 2 block_hashes") - assert event.block_size == block_size, ( - "Block size should be the same as the block size") - assert event.parent_block_hash is None, ( - "Parent block hash should be None") + event, BlockStored), "We should have a BlockStored event" + assert (len(event.block_hashes) == num_blocks + ), "We should have a BlockStored 
event with 2 block_hashes" + assert (event.block_size == block_size + ), "Block size should be the same as the block size" + assert (event.parent_block_hash + is None), "Parent block hash should be None" assert event.lora_id is None, "Lora id should be None" - assert len(event.token_ids) == num_blocks * block_size, ( - "Token ids should be the same as the custom tokens") - assert event.token_ids == custom_tokens, ( - "Token ids should be the same as the custom tokens") + assert (len(event.token_ids) == num_blocks * block_size + ), "Token ids should be the same as the custom tokens" + assert (event.token_ids == custom_tokens + ), "Token ids should be the same as the custom tokens" finally: client.shutdown() + subscriber.close() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "multiprocessing_mode,publisher_config", + [(True, "tcp")], + indirect=["publisher_config"], +) +@multi_gpu_test(num_gpus=4) +async def test_kv_cache_events_dp( + monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool, + publisher_config, +): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + block_size = 16 + num_blocks = 2 + dp_size = 2 + tp_size = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config + + vllm_config = engine_args.create_engine_config( + UsageContext.UNKNOWN_CONTEXT) + + executor_class = Executor.get_class(vllm_config) + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) + await asyncio.sleep(1) + + # Build endpoints for all DP ranks + base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + endpoints = [] + for i in range(dp_size): + offset_endpoint = ZmqEventPublisher.offset_endpoint_port( + base_endpoint, i) + 
endpoints.append(offset_endpoint) + + subscriber = MockSubscriber(endpoints, + topic=publisher_config.topic, + decode_type=KVEventBatch) + + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + all_request_ids = [] + + # Create and add 25 requests + # NOTE: attempts to force routing to both dp groups but can be flaky + for i in range(25): + await asyncio.sleep(0.01) + request = make_request(sampling_params, custom_tokens) + await client.add_request_async(request) + all_request_ids.append(request.request_id) + + await asyncio.sleep(0.1) + + # Initialize outputs dict for all requests + outputs: dict[str, list] = { + req_id: [] + for req_id in all_request_ids + } + + print("processing requests...") + await asyncio.wait_for(loop_until_fully_done_async( + client, outputs), + timeout=20.0) + + # Receive from subscriber until no more messages + print("collecting results...") + results = [] + while True: + result = subscriber.receive_one(timeout=1) + print(result) + if result is None: + break + results.append(result) + + # Collect all events and data_parallel_ranks from all results + all_dp_ranks = [ + received.data_parallel_rank for (_, received) in results + ] + unique_dps = set(all_dp_ranks) + assert ( + len(unique_dps) == 2 + ), f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" + + finally: + client.shutdown() + subscriber.close() @pytest.mark.timeout(20) diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 9bf1c058a1915..2d7935773dd9f 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -28,6 +28,7 @@ class EventBatch( ): ts: float events: list[Any] + data_parallel_rank: Optional[int] = None class KVCacheEvent( @@ -60,7 +61,22 @@ class KVEventBatch(EventBatch): class EventPublisher(ABC): - """Lightweight publisher for EventBatch batches.""" + """Lightweight publisher for EventBatch batches with data parallelism + support. 
+ + In data parallel setups, each DP rank runs its own EventPublisher instance + to avoid duplicate events and ensure proper event attribution: + + - Each DP rank creates a separate publisher + - Publishers automatically annotate events with their data_parallel_rank + - This allows consumers to distinguish events from different DP ranks + + The publisher is responsible for adding DP metadata since the scheduler + operates independently of DP topology and shouldn't need DP awareness. + """ + + def __init__(self, data_parallel_rank: int = 0) -> None: + self._data_parallel_rank = data_parallel_rank @abstractmethod def publish(self, events: EventBatch) -> None: @@ -113,6 +129,7 @@ class ZmqEventPublisher(EventPublisher): def __init__( self, + data_parallel_rank: int, endpoint: str = "tcp://*:5557", replay_endpoint: Optional[str] = None, buffer_steps: int = 10_000, @@ -121,6 +138,7 @@ class ZmqEventPublisher(EventPublisher): topic: str = "", ) -> None: # Storage + super().__init__(data_parallel_rank) self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size) self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps) @@ -128,8 +146,11 @@ class ZmqEventPublisher(EventPublisher): self._ctx = zmq.Context.instance() self._pub: Optional[zmq.Socket] = None self._replay: Optional[zmq.Socket] = None - self._endpoint = endpoint - self._replay_endpoint = replay_endpoint + self._dp_rank = data_parallel_rank + + self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank) + self._replay_endpoint = self.offset_endpoint_port( + replay_endpoint, self._dp_rank) self._hwm = hwm self._socket_setup() @@ -149,6 +170,8 @@ class ZmqEventPublisher(EventPublisher): def publish(self, events: EventBatch) -> None: if not self._running: raise RuntimeError("Publisher is closed") + if events.data_parallel_rank is None: + events.data_parallel_rank = self._data_parallel_rank self._event_queue.put(events) def shutdown(self) -> None: @@ -191,11 +214,12 @@ class 
ZmqEventPublisher(EventPublisher): self._pub.set_hwm(self._hwm) # Heuristic: bind if wildcard / * present, else connect. # bind stable, connect volatile convention - if ("*" in self._endpoint or "::" in self._endpoint - or self._endpoint.startswith("ipc://") - or self._endpoint.startswith("inproc://")): + if (self._endpoint is not None + and ("*" in self._endpoint or "::" in self._endpoint + or self._endpoint.startswith("ipc://") + or self._endpoint.startswith("inproc://"))): self._pub.bind(self._endpoint) - else: + elif self._endpoint is not None: self._pub.connect(self._endpoint) # Set up replay socket: use ROUTER @@ -266,6 +290,38 @@ class ZmqEventPublisher(EventPublisher): # receiving payload is (-1, b""") self._replay.send_multipart((client_id, b"", self.END_SEQ, b"")) + @staticmethod + def offset_endpoint_port(endpoint: Optional[str], + data_parallel_rank: int) -> Optional[str]: + """Helper function to offset the port in an endpoint by + the data parallel rank. + + Args: + endpoint: The endpoint string + (e.g., "tcp://*:5557" or "inproc://cache") + data_parallel_rank: The data parallel rank to offset by + + Returns: + The endpoint with the port offset by data_parallel_rank + or suffix appended + """ + # Do nothing if input is None or data_parallel_rank is 0 + if not endpoint or data_parallel_rank == 0: + return endpoint + + if "inproc" in endpoint: + return f"{endpoint}_dp{data_parallel_rank}" + if "tcp" in endpoint: + if endpoint and ":" in endpoint: + # Get everything after the last colon (the port) + last_colon_idx = endpoint.rfind(":") + base_addr = endpoint[:last_colon_idx] + base_port = int(endpoint[last_colon_idx + 1:]) + new_port = base_port + data_parallel_rank + return f"{base_addr}:{new_port}" + return endpoint + raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'") + class EventPublisherFactory: _registry: dict[str, Callable[..., EventPublisher]] = { @@ -281,7 +337,9 @@ class EventPublisherFactory: cls._registry[name] = ctor 
@classmethod - def create(cls, config: Optional[KVEventsConfig]) -> EventPublisher: + def create(cls, + config: Optional[KVEventsConfig], + data_parallel_rank: int = 0) -> EventPublisher: """Create publisher from a config mapping.""" if not config: return NullEventPublisher() @@ -294,4 +352,5 @@ class EventPublisherFactory: constructor = cls._registry[kind] except KeyError as exc: raise ValueError(f"Unknown event publisher '{kind}'") from exc - return constructor(**config_dict) + return constructor(data_parallel_rank=data_parallel_rank, + **config_dict) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index e510a0626c1b4..32d03b311a4ed 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -80,7 +80,9 @@ class Scheduler(SchedulerInterface): config=self.vllm_config, role=KVConnectorRole.SCHEDULER) self.kv_event_publisher = EventPublisherFactory.create( - self.kv_events_config) + self.kv_events_config, + vllm_config.parallel_config.data_parallel_rank, + ) num_gpu_blocks = self.cache_config.num_gpu_blocks assert num_gpu_blocks is not None and num_gpu_blocks > 0 From abd7df2fca570998693fa8c1ae39d83fb789ef27 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Tue, 3 Jun 2025 17:15:18 -0700 Subject: [PATCH 039/115] [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) --- .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index df8a412935049..0b6c9213ebfff 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -33,7 +33,7 @@ check_num_gpus() { ensure_python_library_installed() { echo "Checking if $1 is installed..." 
- python -c "import $1" > /dev/null 2>&1 + python3 -c "import $1" > /dev/null 2>&1 if [ $? -ne 0 ]; then if [ "$1" == "nixl" ]; then echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation." @@ -121,8 +121,8 @@ main() { echo "All servers are up. Starting benchmark..." # begin benchmark - cd ../../../benchmarks/ - python benchmark_serving.py --port 9000 --seed $(date +%s) \ + cd ../../../../benchmarks/ + python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ --model meta-llama/Llama-3.1-8B-Instruct \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log From 52dceb172d6fe762bb60b670df61866fe86b6f17 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 3 Jun 2025 21:09:13 -0400 Subject: [PATCH 040/115] [Docs] Add developer doc about CI failures (#18782) Signed-off-by: Russell Bryant Co-authored-by: Mark McLoughlin Co-authored-by: Cyrus Leung --- docs/contributing/ci-failures.md | 120 +++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 docs/contributing/ci-failures.md diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md new file mode 100644 index 0000000000000..4d8f78197f336 --- /dev/null +++ b/docs/contributing/ci-failures.md @@ -0,0 +1,120 @@ +# CI Failures + +What should I do when a CI job fails on my PR, but I don't think my PR caused +the failure? + +- Check the dashboard of current CI test failures: + 👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20) + +- If your failure **is already listed**, it's likely unrelated to your PR. + Help fixing it is always welcome! + - Leave comments with links to additional instances of the failure. + - React with a 👍 to signal how many are affected. + +- If your failure **is not listed**, you should **file an issue**. 
+ +## Filing a CI Test Failure Issue + +- **File a bug report:** + 👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml) + +- **Use this title format:** + + ``` + [CI Failure]: failing-test-job - regex/matching/failing:test + ``` + +- **For the environment field:** + + ``` + Still failing on main as of commit abcdef123 + ``` + +- **In the description, include failing tests:** + + ``` + FAILED failing/test.py:failing_test1 - Failure description + FAILED failing/test.py:failing_test2 - Failure description + https://github.com/orgs/vllm-project/projects/20 + https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml + FAILED failing/test.py:failing_test3 - Failure description + ``` + +- **Attach logs** (collapsible section example): +
+ Logs: + + ```text + ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data + --- Logging error --- + Traceback (most recent call last): + File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model + return self.model_executor.execute_model(scheduler_output) + ... + FAILED failing/test.py:failing_test1 - Failure description + FAILED failing/test.py:failing_test2 - Failure description + FAILED failing/test.py:failing_test3 - Failure description + ``` + +
+ +## Logs Wrangling + +Download the full log file from Buildkite locally. + +Strip timestamps and colorization: + +```bash +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' ci.log + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' ci.log +``` + +Use a tool for quick copy-pasting: + +```bash +tail -525 ci_build.log | wl-copy +``` + +## Investigating a CI Test Failure + +1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main) +2. Bisect to find the first build that shows the issue. +3. Add your findings to the GitHub issue. +4. If you find a strong candidate PR, mention it in the issue and ping contributors. + +## Reproducing a Failure + +CI test failures may be flaky. Use a bash loop to run repeatedly: + +```bash +COUNT=1; while pytest -sv tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]; do + COUNT=$[$COUNT + 1]; echo "RUN NUMBER ${COUNT}"; +done +``` + +## Submitting a PR + +If you submit a PR to fix a CI failure: + +- Link the PR to the issue: + Add `Closes #12345` to the PR description. +- Add the `ci-failure` label: + This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20). + +## Other Resources + +- 🔍 [Test Reliability on `main`](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&order=ASC&sort_by=reliability) +- 🧪 [Latest Buildkite CI Runs](https://buildkite.com/vllm/ci/builds?branch=main) + +## Daily Triage + +Use [Buildkite analytics (2-day view)](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&period=2days) to: + +- Identify recent test failures **on `main`**. +- Exclude legitimate test failures on PRs. +- (Optional) Ignore tests with 0% reliability. + +Compare to the [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20). 
From 4555143ea7fdd2b2f0106e40889bfbab49879237 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 4 Jun 2025 09:43:01 +0800 Subject: [PATCH 041/115] [CPU] V1 support for the CPU backend (#16441) --- .../scripts/hardware_ci/run-cpu-test.sh | 13 +- docs/usage/v1_guide.md | 2 + requirements/cpu.txt | 3 + .../attention/test_attention_selector.py | 5 +- .../models/language/generation/test_common.py | 1 - vllm/attention/backends/cpu_mla.py | 6 +- vllm/attention/backends/torch_sdpa.py | 16 +- vllm/compilation/wrapper.py | 7 +- vllm/engine/arg_utils.py | 4 +- vllm/platforms/cpu.py | 67 +++++-- vllm/v1/attention/backends/cpu_attn.py | 163 ++++++++++++++++++ vllm/v1/worker/cpu_model_runner.py | 86 +++++++++ vllm/v1/worker/cpu_worker.py | 101 +++++++++++ vllm/v1/worker/gpu_model_runner.py | 28 +-- vllm/v1/worker/gpu_worker.py | 3 +- 15 files changed, 465 insertions(+), 40 deletions(-) create mode 100644 vllm/v1/attention/backends/cpu_attn.py create mode 100644 vllm/v1/worker/cpu_model_runner.py create mode 100644 vllm/v1/worker/cpu_worker.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 0a11935607e2a..61aa7df13b4d5 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,6 +6,7 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 @@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -56,7 +55,7 @@ function cpu_tests() { # Run AWQ test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + VLLM_USE_V1=0 pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test @@ -68,8 +67,6 @@ function cpu_tests() { # online serving docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ @@ -89,4 +86,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index a2321bf98900b..7c4909cb5d913 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -40,6 +40,8 @@ This living user guide outlines a few known **important changes and limitations* | **NVIDIA** | 🚀 Natively Supported | | **AMD** | 🚧 WIP | | **TPU** | 🚧 WIP | +| **CPU** | 🚧 WIP | + #### Feature / Model | Feature / Model | Status | diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 1213301584ce3..e43b443977524 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -1,6 +1,9 @@ # Common dependencies -r common.txt +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' + # Dependencies for CPUs packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 435fe62256140..f3e64155703c2 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -85,7 +85,10 @@ def test_env( CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, block_size, False) - assert backend.get_name() == "TORCH_SDPA" + if use_v1: + assert backend.get_name() == "TORCH_SDPA_VLLM_V1" + else: + assert backend.get_name() == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.current_platform", diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index ed9e547225149..f656f90c4bd37 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -87,7 +87,6 @@ AITER_MODEL_LIST = [ pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( 
"TitanML/tiny-mixtral", # mixtral - marks=[pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index cf7883e121abb..793cb87b74342 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -178,7 +178,7 @@ class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]): seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_kv_len=max_kv_len, - query_start_loc=query_start_loc, + prefill_query_start_loc=query_start_loc, kv_start_loc=kv_start_loc, max_decode_seq_len=input_data.max_decode_seq_len, num_prefills=input_data.num_prefills, @@ -264,8 +264,8 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): key=k, value=v_padded, out=output, - seqlen_q=prefill_metadata.query_start_loc, - seqlen_k=prefill_metadata.query_start_loc, + seqlen_q=prefill_metadata.prefill_query_start_loc, + seqlen_k=prefill_metadata.prefill_query_start_loc, max_seqlen_q=prefill_metadata.max_query_len, max_seqlen_k=prefill_metadata.max_query_len, pdropout=0.0, diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index f3fb5adcf05ce..23231c323f139 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -87,10 +87,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # For chunked prefill only max_query_len: Optional[int] = None max_kv_len: Optional[int] = None - query_start_loc: Optional[torch.Tensor] = None + prefill_query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None prefill_block_tables: Optional[torch.Tensor] = None + # For V1 logits index only + query_start_loc: Optional[torch.Tensor] = None + # Begin encoder attn & enc/dec cross-attn fields... 
# Encoder sequence lengths representation encoder_seq_lens: Optional[List[int]] = None @@ -375,7 +378,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_kv_len=max_kv_len, - query_start_loc=query_start_loc, + prefill_query_start_loc=query_start_loc, kv_start_loc=kv_start_loc, max_decode_seq_len=input_data.max_decode_seq_len, num_prefills=input_data.num_prefills, @@ -470,6 +473,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ + + # For warming-up + if attn_metadata is None: + return query + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -537,8 +545,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): output = torch.empty_like(query) if prefill_meta := attn_metadata.prefill_metadata: - assert attn_metadata.seq_lens is not None if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + assert attn_metadata.seq_lens is not None self._run_sdpa_forward(output, query, key, @@ -555,7 +563,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): query[:prefill_meta.num_prefill_tokens, :, :], key_cache, value_cache, - prefill_meta.query_start_loc, + prefill_meta.prefill_query_start_loc, prefill_meta.kv_start_loc, prefill_meta.max_query_len, prefill_meta.max_kv_len, diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 8c8d0b5cb2291..2a261c84c3fc3 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -41,11 +41,16 @@ class TorchCompileWrapperWithCustomDispatcher: # compiling the forward method backend = vllm_config.compilation_config.init_backend(vllm_config) + options = None + if isinstance(backend, str) and backend == "inductor": + options = get_current_vllm_config( + ).compilation_config.inductor_compile_config compiled_callable = 
torch.compile( self.forward, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2197d44ca8259..b1c4b27a0ca4e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1399,6 +1399,7 @@ class EngineArgs: "FLASHINFER", "FLASHINFER_VLLM_V1", "ROCM_AITER_MLA", + "TORCH_SDPA_VLLM_V1", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): @@ -1431,7 +1432,8 @@ class EngineArgs: # Non-[CUDA, TPU] may be supported on V1, but off by default for now. v0_hardware = not any( - (current_platform.is_cuda(), current_platform.is_tpu())) + (current_platform.is_cuda(), current_platform.is_tpu(), + current_platform.is_cpu())) if v0_hardware and _warn_or_fallback( # noqa: SIM103 current_platform.device_name): return False diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 2739f5c8c6900..265959d626e0d 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -57,7 +57,10 @@ class CpuPlatform(Platform): logger.info("Using CPU MLA backend.") return "vllm.attention.backends.cpu_mla.CPUMLABackend" logger.info("Using Torch SDPA backend.") - return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" + if use_v1: + return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" + else: + return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: @@ -81,6 +84,8 @@ class CpuPlatform(Platform): if not model_config.enforce_eager: model_config.enforce_eager = True + model_config.disable_cascade_attn = True + cache_config = vllm_config.cache_config ipex_available = find_spec("intel_extension_for_pytorch") is not None @@ -128,7 +133,8 @@ class CpuPlatform(Platform): f" {kv_cache_space}, expect a positive integer 
value.") parallel_config = vllm_config.parallel_config - if (parallel_config.distributed_executor_backend is not None + if (parallel_config.world_size > 1 + and parallel_config.distributed_executor_backend is not None and parallel_config.distributed_executor_backend != "mp"): logger.warning(("%s is not supported on CPU, fallback to mp " "distributed executor backend."), @@ -141,7 +147,38 @@ class CpuPlatform(Platform): parallel_config.sd_worker_cls = \ "vllm.worker.cpu_worker.CPUWorker" else: - parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.cpu_worker.CPUWorker" + else: + parallel_config.worker_cls = \ + "vllm.worker.cpu_worker.CPUWorker" + + # Note: workaround for v1 gpu_model_runner + from vllm.config import CompilationLevel + vllm_config.compilation_config.cudagraph_capture_sizes = [] + + compilation_config = vllm_config.compilation_config + if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE): + compilation_config.level = CompilationLevel.DYNAMO_ONCE + compilation_config.backend = "eager" + compilation_config.custom_ops += ["none"] + compilation_config.inductor_compile_config.update({ + "dce": + True, + "size_asserts": + False, + "nan_asserts": + False, + "memory_planning": + True, + "epilogue_fusion": + True, + }) + + if vllm_config.lora_config is not None: + compilation_config.level = CompilationLevel.NO_COMPILATION assert vllm_config.device_config.device_type == "cpu" @@ -149,6 +186,12 @@ class CpuPlatform(Platform): # Environment variables for CPU executor # + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + # Note: to avoid the error 'nthreads cannot be larger than environment + # variable "NUMEXPR_MAX_THREADS" (64)'. 
+ os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0))) + # Set default threads num for OpenMP parallel os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads()) @@ -171,13 +214,6 @@ class CpuPlatform(Platform): # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size) - if sys.platform == "darwin" and \ - envs.VLLM_WORKER_MULTIPROC_METHOD == "fork": - if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None: - logger.warning( - "Default to spawn method on MacOS. If this is not desired," - " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' if vllm_config.model_config and vllm_config.model_config.use_mla: logger.info( @@ -204,3 +240,14 @@ class CpuPlatform(Platform): Get device specific communicator class for distributed communication. """ return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator" # noqa + + @classmethod + def supports_structured_output(cls) -> bool: + return True + + @classmethod + def supports_v1(cls, model_config) -> bool: + """Returns whether the current platform can support v1 for the supplied + model configuration. 
+ """ + return True diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py new file mode 100644 index 0000000000000..d7a580c2883c3 --- /dev/null +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import torch + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl, + TorchSDPAMetadata) +from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.ipex_attn import PagedAttention +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.block_table import BlockTable +from vllm.v1.worker.cpu_model_runner import CPUModelRunner +from vllm.v1.worker.gpu_input_batch import InputBatch + + +class TorchSDPABackend: + accept_output_buffer: bool = False + + @staticmethod + def get_name() -> str: + return "TORCH_SDPA_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["TorchSDPABackendImpl"]: + return TorchSDPABackendImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return TorchSDPAMetadata + + @staticmethod + def get_state_cls() -> type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]: + return TorchSDPAMetadataBuilderV1 + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +class TorchSDPAMetadataBuilderV1: + + def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec, + block_table: BlockTable) -> None: + self.runner = 
runner + self.block_table = block_table + + # For reorder + self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs, + dtype=np.int64) + self.reorder_decode_req_index_list = np.empty(self.runner.max_num_reqs, + dtype=np.int64) + self.num_prompt_req: int = 0 + + self.seq_start_loc_cpu = torch.zeros( + runner.max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + ) + self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() + + def reorder_batch(self, input_batch: InputBatch, + scheduler_output: SchedulerOutput) -> bool: + prompt_list_idx = 0 + decode_list_idx = 0 + for req_index in range(input_batch.num_reqs): + if input_batch.num_computed_tokens_cpu[ + req_index] < input_batch.num_prompt_tokens[req_index]: + # prompt stage + self.reorder_prompt_req_index_list[prompt_list_idx] = req_index + prompt_list_idx += 1 + else: + # decode stage + self.reorder_decode_req_index_list[decode_list_idx] = req_index + decode_list_idx += 1 + assert decode_list_idx + prompt_list_idx == input_batch.num_reqs + + # Update prompt requests number + self.num_prompt_req = prompt_list_idx + + reorder_req_num = 0 + for req_index in range(decode_list_idx): + if self.reorder_decode_req_index_list[req_index] < prompt_list_idx: + reorder_req_num += 1 + else: + break + + if reorder_req_num == 0: + return False + + reorder_prompt_list = ( + self.reorder_prompt_req_index_list[:prompt_list_idx] + [-reorder_req_num:]) + reorder_decode_list = ( + self.reorder_decode_req_index_list[:decode_list_idx] + [:reorder_req_num]) + assert reorder_decode_list.size == reorder_prompt_list.size + + for idx in range(reorder_req_num): + prompt_req_index = reorder_prompt_list[idx].item() + decode_req_index = reorder_decode_list[idx].item() + input_batch.swap_states(prompt_req_index, decode_req_index) + + return True + + def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata): + runner = self.runner + block_table 
= self.block_table + seq_lens_np = runner.seq_lens_np[:num_reqs] + num_prompt_req = self.num_prompt_req + max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item( + ) if num_prompt_req > 0 else 0 + max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item( + ) if num_prompt_req < num_reqs else 0 + self.seq_start_loc_np[0] = 0 + np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1]) + num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item() + num_decode_tokens = runner.query_start_loc_np[num_reqs].item( + ) - num_prefill_tokens + slot_mapping = block_table.slot_mapping_cpu[:num_actual_tokens].long() + block_table_tensor = block_table.get_device_tensor() + attn_metadata = TorchSDPAMetadata( + num_prefills=num_prompt_req, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping, + seq_lens_tensor=runner. + seq_lens_cpu[num_prompt_req:num_reqs], # decode + max_decode_seq_len=max_decode_seq_len, # decode + block_tables=block_table_tensor[num_prompt_req:num_reqs], # decode + chunked_prefill=True, + max_query_len=max_query_len, + max_kv_len=max_prefill_seq_len, + prefill_query_start_loc=runner. 
+ query_start_loc_cpu[:num_prompt_req + 1], # prefill + kv_start_loc=self.seq_start_loc_cpu[:num_prompt_req + + 1], # prefill + prefill_block_tables=block_table_tensor[: + num_prompt_req], # prefill + query_start_loc=runner.query_start_loc_cpu[:num_reqs + + 1], # for logits index + multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + ) + + return attn_metadata diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py new file mode 100644 index 0000000000000..607cfc0ef69cd --- /dev/null +++ b/vllm/v1/worker/cpu_model_runner.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +from contextlib import contextmanager +from typing import Any + +import torch + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.v1.worker.gpu_model_runner import GPUModelRunner + +logger = init_logger(__name__) + + +class CPUModelRunner(GPUModelRunner): + + def __init__(self, vllm_config: VllmConfig, device: torch.device): + super().__init__(vllm_config, device) + + assert device == torch.device("cpu") + assert self.speculative_config is None, "spec decode is not supported." 
+ + self.use_cuda_graph = False + self.cascade_attn_enabled = False + + self._postprocess_tenosrs() + + def _postprocess_tenosrs(self) -> None: + # Note: replace device tensors with cpu tensors + def replace_tensor(obj: Any, cpu_attr_name: str, + device_attr_name) -> None: + cpu_tensor = getattr(obj, cpu_attr_name, None) + device_tensor = getattr(obj, device_attr_name, None) + if cpu_tensor is not None and device_tensor is not None: + assert isinstance(cpu_tensor, torch.Tensor) + assert isinstance(device_tensor, torch.Tensor) + setattr(obj, device_attr_name, cpu_tensor) + + for k, v in vars(self).items(): + if k.endswith("_cpu") and isinstance(v, torch.Tensor): + replace_tensor(self, k, k[:-4]) + + for k, v in vars(self.input_batch).items(): + if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): + replace_tensor(self.input_batch, k, k[:-11]) + + for k, v in vars(self.input_batch.block_table).items(): + if k.endswith("_cpu") and isinstance(v, torch.Tensor): + replace_tensor(self.input_batch.block_table, k, k[:-4]) + + def load_model(self) -> None: + logger.info("Starting to load model %s...", self.model_config.model) + self.model = get_model(vllm_config=self.vllm_config) + + if self.lora_config: + self.model = self.load_lora_model(self.model, self.model_config, + self.scheduler_config, + self.lora_config, self.device) + + def warming_up_model(self) -> None: + logger.info("Warming up model for the compilation...") + # Only generate graph for the generic shape + self._dummy_run(max(16, self.max_num_reqs)) + logger.info("Warming up done.") + + def _init_device_properties(self) -> None: + pass + + def _sync_device(self) -> None: + pass + + +@contextmanager +def _set_global_compilation_settings(): + import torch._inductor.config + + # Note: The CPPGEMM backend requires freezing parameters. 
+ freezing_value = torch._inductor.config.freezing + torch._inductor.config.freezing = True + # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects + # including object type dict" + force_disable_caches = torch._inductor.config.force_disable_caches + torch._inductor.config.force_disable_caches = True + yield + torch._inductor.config.freezing = freezing_value + torch._inductor.config.force_disable_caches = force_disable_caches diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py new file mode 100644 index 0000000000000..0b710b7bc203f --- /dev/null +++ b/vllm/v1/worker/cpu_worker.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +import os +from typing import Optional + +import torch + +from vllm import envs +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group, get_tp_group +from vllm.logger import init_logger +from vllm.model_executor.utils import set_random_seed +from vllm.sequence import IntermediateTensors +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.worker.cpu_model_runner import CPUModelRunner +from vllm.v1.worker.gpu_worker import (Worker, + init_worker_distributed_environment) + +logger = init_logger(__name__) + + +class CPUWorker(Worker): + + def __init__(self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False): + super().__init__(vllm_config, + local_rank, + rank, + distributed_init_method, + is_driver_worker=is_driver_worker) + + self.parallel_config.disable_custom_all_reduce = True + + def init_device(self): + # Setup OpenMP threads affinity. 
+ omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] + ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + if ret: + logger.info(ret) + + # Note: unique identifier for creating allreduce shared memory + os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split( + ":")[-1] + # Initialize the distributed environment. + init_worker_distributed_environment(self.vllm_config, self.rank, + self.distributed_init_method, + self.local_rank, "gloo") + # Set random seed. + set_random_seed(self.model_config.seed) + + # Construct the model runner + self.model_runner: CPUModelRunner = CPUModelRunner( + self.vllm_config, torch.device("cpu")) + + def sleep(self, level: int = 1) -> None: + logger.warning("sleep mode is not supported on CPU, ignore it.") + pass + + def wake_up(self, tags: Optional[list[str]] = None) -> None: + logger.warning("sleep mode is not supported on CPU, ignore it.") + pass + + def determine_available_memory(self) -> int: + return self.cache_config.cpu_kvcache_space_bytes # type: ignore + + def compile_or_warm_up_model(self) -> None: + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. 
+ set_random_seed(self.model_config.seed) + self.model_runner.warming_up_model() + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> Optional[ModelRunnerOutput]: + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = IntermediateTensors( + get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group())) + + output = self.model_runner.execute_model(scheduler_output, + intermediate_tensors) + + if not get_pp_group().is_last_rank: + assert isinstance(output, IntermediateTensors) + get_pp_group().send_tensor_dict(output.tensors, + all_gather_group=get_tp_group()) + return None + + assert isinstance(output, ModelRunnerOutput) + return output if self.is_driver_worker else None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6a566a602b190..6ea6bb020ed7f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import copy import gc import time import weakref -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np import torch @@ -38,7 +38,6 @@ from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, async_tensor_h2d, cdiv, check_use_alibi, is_pin_memory_available) -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, @@ -203,8 +202,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.vllm_config.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
- self.device_properties = torch.cuda.get_device_properties(self.device) - self.num_sms = self.device_properties.multi_processor_count + self._init_device_properties() # Persistent buffers for CUDA graphs. self.input_ids = torch.zeros(self.max_num_tokens, @@ -315,6 +313,17 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.input_batch, scheduler_output) return batch_reordered + # Note: used for model runner override. + def _init_device_properties(self) -> None: + """Initialize attributes from torch.cuda.get_device_properties + """ + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + + # Note: used for model runner override. + def _sync_device(self) -> None: + torch.cuda.synchronize() + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -538,8 +547,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> tuple[dict[str, FlashAttentionMetadata], torch.Tensor, - Optional[SpecDecodeMetadata]]: + ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -652,7 +660,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata: dict[str, FlashAttentionMetadata] = {} + attn_metadata: dict[str, Any] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. for kv_cache_group_id, kv_cache_group_spec in enumerate( @@ -1710,7 +1718,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Must synchronize the non-blocking GPU->CPU transfers. 
if prompt_logprobs_dict: - torch.cuda.synchronize() + self._sync_device() return prompt_logprobs_dict @@ -1740,7 +1748,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): dtype=np.int32) if skip_attn: - attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None + attn_metadata: Optional[dict[str, Any]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1964,7 +1972,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): sampler_output = self._dummy_sampler_run(hidden_states) else: sampler_output = None - torch.cuda.synchronize() + self._sync_device() del hidden_states, sampler_output self.encoder_cache.clear() gc.collect() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f36cf5d5c3191..3bf3b2221a447 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -342,13 +342,14 @@ def init_worker_distributed_environment( rank: int, distributed_init_method: Optional[str] = None, local_rank: int = -1, + backend: str = "nccl", ) -> None: """Initialize the distributed environment.""" parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank) + distributed_init_method, local_rank, backend) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) From 1409ef913446aa282f6426efbb0ed02a59320467 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Jun 2025 04:24:56 +0100 Subject: [PATCH 042/115] [Core] Cast multimodal input in hf processor (#18862) Signed-off-by: Lukas Geiger --- vllm/inputs/registry.py | 26 +++++++++++++++++-- vllm/multimodal/inputs.py | 8 +----- vllm/spec_decode/draft_model_runner.py | 1 - vllm/v1/worker/gpu_model_runner.py | 2 -- vllm/v1/worker/tpu_model_runner.py | 2 -- vllm/worker/cpu_enc_dec_model_runner.py | 1 - 
vllm/worker/cpu_model_runner.py | 1 - vllm/worker/cpu_pooling_model_runner.py | 1 - vllm/worker/enc_dec_model_runner.py | 1 - vllm/worker/model_runner.py | 1 - vllm/worker/multi_step_neuron_model_runner.py | 1 - ...i_step_neuronx_distributed_model_runner.py | 1 - vllm/worker/neuron_model_runner.py | 2 -- vllm/worker/pooling_model_runner.py | 1 - vllm/worker/xpu_model_runner.py | 1 - 15 files changed, 25 insertions(+), 25 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 73d19aecde6c5..3dad021e31668 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -4,9 +4,12 @@ from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import TypeVar +from vllm.jsontree import JSONTree, json_map_leaves +from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import resolve_mm_processor_kwargs @@ -21,6 +24,8 @@ _T = TypeVar("_T") _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig) _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) +logger = init_logger(__name__) + @dataclass(frozen=True) class InputContext: @@ -134,7 +139,7 @@ class InputProcessingContext(InputContext): hf_processor: ProcessorMixin, data: Mapping[str, object], kwargs: Mapping[str, object] = {}, - ) -> BatchFeature: + ) -> Union[BatchFeature, JSONTree]: """ Call `hf_processor` on the prompt `data` (text, image, audio...) with configurable options `kwargs`. 
@@ -154,8 +159,25 @@ class InputProcessingContext(InputContext): allow_var_kwargs=True, ) + def maybe_cast_dtype(x): + # This mimics the behavior of transformers.BatchFeature + if isinstance(x, torch.Tensor) and x.is_floating_point(): + return x.to(dtype=self.model_config.dtype) + return x + try: - return hf_processor(**data, **merged_kwargs, return_tensors="pt") + output = hf_processor(**data, **merged_kwargs, return_tensors="pt") + # this emulates output.to(dtype=self.model_config.dtype) + cast_output = json_map_leaves(maybe_cast_dtype, output) + if isinstance(output, BatchFeature): + return BatchFeature(cast_output) + + logger.warning_once( + f"{type(hf_processor).__name__} did not return `BatchFeature`. " + "Make sure to match the behaviour of `ProcessorMixin` when " + "implementing custom processors.") + return cast_output + except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 35d2a6e8c74ff..0bf5b1cf1c6c7 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -747,17 +747,11 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): batched_inputs: BatchedTensorInputs, *, device: torch.types.Device, - dtype: Optional[torch.dtype] = None, ) -> BatchedTensorInputs: json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - def maybe_cast_dtype(x: torch.Tensor): - # This mimics the behavior of transformers.BatchFeature - return x.to(dtype=dtype) if x.is_floating_point() else x - json_mapped = json_map_leaves( - # NOTE: Cast the dtype before sending it to device - lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True), + lambda x: x.to(device=device, non_blocking=True), json_inputs, ) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 8ccfefea1acbd..96646ec947186 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ 
b/vllm/spec_decode/draft_model_runner.py @@ -297,7 +297,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_runner.model_config.dtype, device=self.device, ), **model_execute_kwargs, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6ea6bb020ed7f..9ac33a1499610 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -957,7 +957,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) batched_mm_inputs = MultiModalKwargs.as_kwargs( batched_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) @@ -1951,7 +1950,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 73c445d14e38e..94e438fb44ec1 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -718,7 +718,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) batched_mm_inputs = MultiModalKwargs.as_kwargs( batched_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) @@ -1560,7 +1559,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 677d66357a7fa..c99e2652a3972 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -300,7 +300,6 @@ class CPUEncoderDecoderModelRunner( model_input.encoder_input_positions, **MultiModalKwargs.as_kwargs( 
model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), "intermediate_tensors": diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 6213cf760ac55..68cdf65cafa79 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -630,7 +630,6 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ) execute_model_kwargs = {} diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 174f86f48b568..203fdf225a41a 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -53,7 +53,6 @@ class CPUPoolingModelRunner( model_input.input_positions, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), **cross_enc_kwargs, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index a3e7b0147961c..8d92edc5b386e 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -205,7 +205,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), **seqlen_agnostic_kwargs, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 75501e0f748ab..82db6617ba55f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1848,7 +1848,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), 
**seqlen_agnostic_kwargs, diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index 336e41649df58..25f588077cb42 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -73,7 +73,6 @@ class MultiStepNeuronModelRunner(NeuronModelRunner): input_block_ids=model_input.input_block_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index de9827723eecf..dd521dd67dad0 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -52,7 +52,6 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): sampling_params=sampling_params, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 28855bb4698bc..7ccf1a2c0a876 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -395,7 +395,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): adapter_ids=model_input.adapter_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) @@ -408,7 +407,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): input_block_ids=model_input.input_block_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index be6b3d1379fdc..f80955f71a5a3 100644 --- a/vllm/worker/pooling_model_runner.py +++ 
b/vllm/worker/pooling_model_runner.py @@ -122,7 +122,6 @@ class PoolingModelRunner( intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), **cross_enc_kwargs, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index ecbb63d912766..b2d3ce8526d51 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -565,7 +565,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) From 5d6d1adf15aca59cb135853d0f11308af4bbd6e3 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Wed, 4 Jun 2025 08:13:01 +0400 Subject: [PATCH 043/115] [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) --- CMakeLists.txt | 1 + csrc/ops.h | 5 ++ csrc/sampler.cu | 86 +++++++++++++++++++ csrc/torch_bindings.cpp | 7 ++ .../test_apply_repetition_penalties.py | 76 ++++++++++++++++ vllm/_custom_ops.py | 39 +++++++++ vllm/model_executor/layers/utils.py | 13 +-- 7 files changed, 218 insertions(+), 9 deletions(-) create mode 100644 csrc/sampler.cu create mode 100644 tests/kernels/test_apply_repetition_penalties.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 87aa23c080f50..f11d28590b284 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -242,6 +242,7 @@ set(VLLM_EXT_SRC "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" "csrc/layernorm_quant_kernels.cu" + "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" diff --git a/csrc/ops.h b/csrc/ops.h index 7044b4588b81f..297f32b4a2a06 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& 
weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void apply_repetition_penalties_(torch::Tensor& logits, + const torch::Tensor& prompt_mask, + const torch::Tensor& output_mask, + const torch::Tensor& repetition_penalties); + void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, double epsilon); diff --git a/csrc/sampler.cu b/csrc/sampler.cu new file mode 100644 index 0000000000000..ee5793dda0ef8 --- /dev/null +++ b/csrc/sampler.cu @@ -0,0 +1,86 @@ +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +template +__global__ void apply_repetition_penalties_kernel( + scalar_t* __restrict__ logits, // [num_seqs, vocab_size] + const bool* __restrict__ prompt_mask, // [num_seqs, vocab_size] + const bool* __restrict__ output_mask, // [num_seqs, vocab_size] + const scalar_t* __restrict__ repetition_penalties, // [num_seqs] + const int num_seqs, const int vocab_size, const int tile_size) { + // Each block handles one sequence and a tile of vocab + const int seq_idx = blockIdx.x; + if (seq_idx >= num_seqs) return; + + const int tile_start = blockIdx.y * tile_size; + const int tile_end = min(tile_start + tile_size, vocab_size); + + // Load repetition penalty for this sequence + const scalar_t penalty = repetition_penalties[seq_idx]; + + // Each thread processes multiple vocab items within the tile + for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end; + vocab_idx += blockDim.x) { + const int64_t idx = static_cast(seq_idx) * vocab_size + vocab_idx; + const bool is_repeated = prompt_mask[idx] || output_mask[idx]; + if (is_repeated) { + scalar_t logit = logits[idx]; + if (logit > 0) { + logits[idx] = logit / penalty; + } else { + logits[idx] = logit * penalty; + } + } + } +} + +} // namespace vllm + +void apply_repetition_penalties_( + torch::Tensor& 
logits, // [num_seqs, vocab_size], in-place + const torch::Tensor& prompt_mask, // [num_seqs, vocab_size] + const torch::Tensor& output_mask, // [num_seqs, vocab_size] + const torch::Tensor& repetition_penalties) { // [num_seqs] + TORCH_CHECK(logits.is_contiguous()); + TORCH_CHECK(prompt_mask.is_contiguous()); + TORCH_CHECK(output_mask.is_contiguous()); + TORCH_CHECK(repetition_penalties.is_contiguous()); + + int vocab_size = logits.size(-1); + int num_seqs = logits.size(0); + + // Get number of SMs on the current device + int sms = 0; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, + logits.get_device()); + + // Compute tile_num and tile_size + int tile_num = + std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs)); + int tile_size = (vocab_size + tile_num - 1) / tile_num; + + // Each block handles one sequence and a tile of vocab + dim3 grid(num_seqs, tile_num); + dim3 block(std::min(tile_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(logits)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + logits.scalar_type(), "apply_repetition_penalties_kernel", [&] { + vllm::apply_repetition_penalties_kernel + <<>>( + logits.data_ptr(), prompt_mask.data_ptr(), + output_mask.data_ptr(), + repetition_penalties.data_ptr(), num_seqs, vocab_size, + tile_size); + }); +} \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 371894c56a79b..3fffaf290ad34 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -170,6 +170,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Apply repetition penalties to logits in-place + ops.def( + "apply_repetition_penalties_(Tensor! 
logits, Tensor prompt_mask, " + "Tensor output_mask, Tensor repetition_penalties) -> ()"); + ops.impl("apply_repetition_penalties_", torch::kCUDA, + &apply_repetition_penalties_); + // Layernorm-quant // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py new file mode 100644 index 0000000000000..9115949a16514 --- /dev/null +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm._custom_ops import (apply_repetition_penalties_cuda, + apply_repetition_penalties_torch) +from vllm.platforms import current_platform + +NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] +# [stress, stress, stress, Qwen, llama 4] +VOCAB_SIZES = [17, 256, 1019, 151936, 202048] +REPETITION_PENALTY_VALUES = [1.05] +SEEDS = [0] +DTYPES = [torch.float32, torch.float16] + + +@pytest.mark.parametrize("num_seqs", NUM_SEQS) +@pytest.mark.parametrize("vocab_size", VOCAB_SIZES) +@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test for checking CUDA kernel") +@torch.inference_mode() +def test_apply_repetition_penalties( + num_seqs: int, + vocab_size: int, + repetition_penalty: float, + dtype: torch.dtype, + seed: int, +) -> None: + """ + Test the apply_repetition_penalties custom op + against a reference implementation. 
+ """ + current_platform.seed_everything(seed) + torch.set_default_device("cuda:0") + + # Create test data + logits = torch.randn(num_seqs, vocab_size, dtype=dtype) + + # Create masks with some random tokens marked as repeated + prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + + # Mark some tokens as repeated in prompt and output + prompt_indices = torch.randint(0, vocab_size, + (num_seqs, max(1, vocab_size // 200))) + output_indices = torch.randint(0, vocab_size, + (num_seqs, max(1, vocab_size // 200))) + + for i in range(num_seqs): + prompt_mask[i, prompt_indices[i]] = True + output_mask[i, output_indices[i]] = True + + # Create repetition penalties tensor + repetition_penalties = torch.full((num_seqs, ), + repetition_penalty, + dtype=dtype) + + # Run all three implementations + logits_torch = logits.clone() + logits_cuda = logits.clone() + + apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask, + repetition_penalties) + apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask, + repetition_penalties) + + # Compare all outputs to reference + torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3) + + # Test the operator by applying the opcheck utility + opcheck(torch.ops._C.apply_repetition_penalties_, + (logits.clone(), prompt_mask, output_mask, repetition_penalties)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 008a7aa94939b..3282edf410b6e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -282,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) +def apply_repetition_penalties_torch( + logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: + repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat( + 1, logits.size(1)) + # If 
token appears in prompt or output, apply, otherwise use 1.0 for no-op. + penalties = torch.where(prompt_mask | output_mask, repetition_penalties, + 1.0) + # If logits are positive, divide by penalty, otherwise multiply by penalty. + scaling = torch.where(logits > 0, 1.0 / penalties, penalties) + logits *= scaling + + +def apply_repetition_penalties_cuda( + logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: + torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask, + repetition_penalties) + + +def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, + repetition_penalties: torch.Tensor) -> None: + """Apply repetition penalties to logits in-place. + + Args: + logits: The logits tensor of shape [num_seqs, vocab_size]. + prompt_mask: A boolean tensor indicating which tokens appear in the prompt. + output_mask: A boolean tensor indicating which tokens appear in the output. + repetition_penalties: The repetition penalties of shape (num_seqs, ). 
+ """ + if current_platform.is_cuda() and logits.is_contiguous(): + apply_repetition_penalties_cuda(logits, prompt_mask, output_mask, + repetition_penalties) + else: + apply_repetition_penalties_torch(logits, prompt_mask, output_mask, + repetition_penalties) + + def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index d97d842386972..41b5253dca048 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -50,16 +50,11 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, vocab_size, num_seqs) output_bin_counts, output_mask = get_token_bin_counts_and_mask( output_tokens_tensor, vocab_size, num_seqs) - repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat( - 1, vocab_size) - # If token appears in prompt or output, apply, otherwise use 1.0 for no-op. - penalties = torch.where(prompt_mask | output_mask, repetition_penalties, - 1.0) - - # If logits are positive, divide by penalty, otherwise multiply by penalty. - scaling = torch.where(logits > 0, 1.0 / penalties, penalties) - logits *= scaling + # Apply repetition penalties as a custom op + from vllm._custom_ops import apply_repetition_penalties + apply_repetition_penalties(logits, prompt_mask, output_mask, + repetition_penalties) # We follow the definition in OpenAI API. 
# Refer to https://platform.openai.com/docs/api-reference/parameter-details From 8d646c2e53d3d840a3442bdd00845a6b57eb666f Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Wed, 4 Jun 2025 12:23:26 +0800 Subject: [PATCH 044/115] [Cleanup][v1]:remote guided-decoding-backend for example (#19059) Signed-off-by: calvin chen <120380290@qq.com> --- .../online_serving/openai_chat_completion_structured_outputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 64379083dcca8..5c55d53138a8f 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -139,7 +139,6 @@ def extra_backend_options_completion(client: OpenAI, model: str): extra_body={ "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], - "guided_decoding_backend": "xgrammar", "guided_decoding_disable_fallback": True, }, ) From 41aa5784287f00b026f3ba225ac18ab3caccc622 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 4 Jun 2025 12:40:26 +0800 Subject: [PATCH 045/115] [NVIDIA] Add Cutlass MLA backend (#17625) --- csrc/attention/mla/cutlass_mla_kernels.cu | 2 +- tests/kernels/test_cutlass_mla_decode.py | 4 +- vllm/engine/arg_utils.py | 1 + vllm/platforms/cuda.py | 8 ++ vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/mla/common.py | 2 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 96 +++++++++++++++++++ 7 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 vllm/v1/attention/backends/mla/cutlass_mla.py diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu index 6743af0cf2dba..f4b6b19f4b232 100644 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options( 
{static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, hw_info, - -1, // split_kv + 1, // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index c56024b757e14..2b745b84dae6c 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -76,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, pack_factor = 128 // block_size block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor - q = torch.randn(bs, h_q, d) + # Amplify input values to ensure test coverage of edge cases where CUTLASS + # kernel errors occur with split_k settings. + q = torch.randn(bs, h_q, d) * 100 block_table = torch.randint(0, bs * block_num, (bs, block_num), dtype=torch.int32) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b1c4b27a0ca4e..90134683180a7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1395,6 +1395,7 @@ class EngineArgs: "PALLAS_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TRITON_MLA", + "CUTLASS_MLA_VLLM_V1", "FLASHMLA", "FLASHINFER", "FLASHINFER_VLLM_V1", diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 07ae470fabfb8..bde606f0c1ef7 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -183,6 +183,14 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here + if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1: + if use_v1: + logger.info_once("Using Cutlass MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." 
+ "cutlass_mla.CutlassMLABackend") + else: + logger.warning( + "Cutlass MLA backend is only supported on V1 engine") if selected_backend == _Backend.TRITON_MLA or block_size != 64: if use_v1: logger.info_once("Using Triton MLA backend on V1 engine.") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 1ec9c78a361af..7fef697d8f014 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -51,6 +51,7 @@ class _Backend(enum.Enum): TRITON_MLA_VLLM_V1 = enum.auto() FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 + CUTLASS_MLA_VLLM_V1 = enum.auto() HPU_ATTN = enum.auto() PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 06acbb909a4f6..e6b4f6404632c 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -350,7 +350,7 @@ class MLACommonMetadataBuilder(Generic[M]): self.num_heads = model_config.get_num_attention_heads( runner.parallel_config) self.mla_dims = get_mla_dims(model_config) - self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3) + self.aot_schedule = current_platform.is_cuda() self.kv_cache_spec = kv_cache_spec # Dont try to access the runner on AMD diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py new file mode 100644 index 0000000000000..70aee058e2963 --- /dev/null +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Optional + +import torch + +import vllm._custom_ops as ops +from vllm.attention.backends.abstract import (AttentionType, + is_quantized_kv_cache) +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import (MLACommonBackend, + MLACommonImpl, + MLACommonMetadata) + +logger = init_logger(__name__) + + +class CutlassMLABackend(MLACommonBackend): + + 
@staticmethod + def get_name() -> str: + return "CUTLASS_MLA_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["CutlassMLAImpl"]: + return CutlassMLAImpl + + +class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[dict[str, Any]], + logits_soft_cap: Optional[float], + attn_type: str, + # MLA Specific Arguments + **mla_args) -> None: + super().__init__(num_heads, head_size, scale, num_kv_heads, + alibi_slopes, sliding_window, kv_cache_dtype, + blocksparse_params, logits_soft_cap, attn_type, + **mla_args) + + unsupported_features = [ + alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap + ] + if any(unsupported_features): + raise NotImplementedError( + "CutlassMLAImpl does not support one of the following: " + "alibi_slopes, sliding_window, blocksparse_params, " + "logits_soft_cap") + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "CutlassMLAImpl") + + if is_quantized_kv_cache(self.kv_cache_dtype): + raise NotImplementedError( + "CutlassMLA V1 with FP8 KV cache not yet supported") + + def _forward_decode( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: MLACommonMetadata, + ) -> torch.Tensor: + assert kv_c_and_k_pe_cache.numel() > 0 + assert attn_metadata.decode is not None + + if self.kv_cache_dtype.startswith("fp8"): + raise NotImplementedError("FP8 Cutlass MLA not yet supported") + + B = q_nope.shape[0] + + o = torch.empty((B, self.num_heads, self.kv_lora_rank), + dtype=q_nope.dtype, + device=q_nope.device) + + # Run MLA + # Clone q_nope and q_pe to make sure strides computation is correct. 
+ q_nope = q_nope.clone() + q_pe = q_pe.clone() + ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache, + attn_metadata.decode.seq_lens, + attn_metadata.decode.block_table, self.scale) + + return self._v_up_proj(o) From b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 3 Jun 2025 23:10:15 -0700 Subject: [PATCH 046/115] [Bugfix] Fix FA3 full cuda graph correctness (#19106) Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 1 + .../compile/piecewise/test_full_cudagraph.py | 7 +++-- vllm/v1/attention/backends/flash_attn.py | 29 ++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 5 ++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8ab96b3b7ac3c..4ee6b499b5396 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -320,6 +320,7 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental, amdproduction] diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 3188ea40f9ee6..134bade486079 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -7,6 +7,7 @@ import pytest from vllm import LLM, SamplingParams from vllm.config import CompilationConfig +from vllm.platforms import current_platform MODEL = "Qwen/Qwen2-1.5B-Instruct" @@ -37,7 +38,7 @@ def full_cudagraph_llm(): "VLLM_FLASH_ATTN_VERSION": "3" }): return LLM(model=MODEL, - gpu_memory_utilization=0.2, + gpu_memory_utilization=0.3, compilation_config=CompilationConfig(full_cuda_graph=True)) @@ -48,7 +49,7 @@ def piecewise_llm(): "VLLM_FLASH_ATTN_VERSION": "3" }): return LLM(model=MODEL, - 
gpu_memory_utilization=0.5, + gpu_memory_utilization=0.6, compilation_config=CompilationConfig()) @@ -61,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int): return llm.generate(prompts, sampling_params) +@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), + reason="Only Hopper GPUs support FlashAttention 3") @pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10), (16, 10), (25, 10), (32, 10), (45, 10), diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a92c51883af1c..a9f748d026f4b 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -307,13 +307,14 @@ class FlashAttentionMetadataBuilder: self.kv_cache_spec = kv_cache_spec self.block_table = block_table - if get_flash_attn_version() == 3: - self.aot_schedule = not compilation_config.full_cuda_graph - if not self.aot_schedule: - logger.warning( - "AOT Schedule is disabled when using full_cuda_graph") - else: - self.aot_schedule = False + self.aot_schedule = (get_flash_attn_version() == 3) + self.use_full_cuda_graph = compilation_config.full_cuda_graph + if self.use_full_cuda_graph and not self.aot_schedule: + raise ValueError("Full CUDA graph mode requires AOT scheduling, " + "which requires FlashAttention 3.") + self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1, + dtype=torch.int32, + device=self.runner.device) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. 
@@ -326,7 +327,7 @@ class FlashAttentionMetadataBuilder: def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): - max_seq_len = self.runner.seq_lens_np[:num_reqs].max() + max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max()) query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table = self.block_table @@ -448,6 +449,18 @@ class FlashAttentionMetadataBuilder: max_seq_len=max_seq_len, causal=True) + if self.use_full_cuda_graph: + assert scheduler_metadata is not None + n = scheduler_metadata.shape[0] + self.scheduler_metadata[:n].copy_(scheduler_metadata, + non_blocking=True) + # NOTE(woosuk): We should zero out the rest of the scheduler + # metadata to guarantee the correctness. Otherwise, some thread + # blocks may use the invalid scheduler metadata and overwrite the + # output buffer. + self.scheduler_metadata[n:] = 0 + scheduler_metadata = self.scheduler_metadata[:n] + attn_metadata = FlashAttentionMetadata( num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9ac33a1499610..4a67e37781bf6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1750,6 +1750,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): attn_metadata: Optional[dict[str, Any]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] + # Make sure max_model_len is used at the graph capture time. 
+ self.seq_lens_np[:num_reqs] = self.max_model_len + self.seq_lens_np[num_reqs:] = 0 + self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs], + non_blocking=True) seq_lens = self.seq_lens[:num_reqs] common_attn_metadata = CommonAttentionMetadata( From 3336c8cfbef6c7d6688ca1e5b0b26424baef02c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Wed, 4 Jun 2025 16:42:06 +0800 Subject: [PATCH 047/115] Fix #19130 (#19132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- .../vision_language_multi_image.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index de6365c0d8581..ea7a793d026b4 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -593,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." 
) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2-VL-7B-Instruct" # Tested on L40 engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -630,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -644,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." 
) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -680,10 +688,18 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages, return_video_kwargs=False) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, From 8e972d9c44cc8a6b1d0a3596c41604c56a492977 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Wed, 4 Jun 2025 01:43:00 -0700 Subject: [PATCH 048/115] [TPU] Skip hanging tests (#19115) Signed-off-by: Siyuan Liu --- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- tests/v1/tpu/test_spmd_model_weight_loading.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 3212b660ec356..a394046d2c8fe 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \ run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" run_and_track_test 11 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v 
/workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'" run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 13 "test_lora.py" \ diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py index d36edfc3fb618..916325e41b922 100644 --- a/tests/v1/tpu/test_spmd_model_weight_loading.py +++ b/tests/v1/tpu/test_spmd_model_weight_loading.py @@ -45,11 +45,14 @@ def _get_spmd_mesh(): return MESH -@pytest.mark.parametrize("model", [ - "Qwen/Qwen2-1.5B-Instruct", - "meta-llama/Llama-3.1-8B-Instruct", - "meta-llama/Llama-3.1-70B-Instruct", -]) +@pytest.mark.parametrize( + "model", + [ + "Qwen/Qwen2-1.5B-Instruct", + # Skip large models due to CI runner disk space limitations + # "meta-llama/Llama-3.1-8B-Instruct", + # "meta-llama/Llama-3.1-70B-Instruct", + ]) def test_tpu_model_loader(model): # Skip the 70B test if there are less than 8 chips # TODO: Query using torch xla API, the query API is not working From 2669a0d7b518371bb1d950425bd64a320010733f Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Wed, 4 Jun 2025 02:10:45 -0700 Subject: [PATCH 049/115] Fix ValueError: Missing value for tag key(s): model_name,engine. 
(#19113) Signed-off-by: Seiji Eicher --- tests/v1/metrics/test_ray_metrics.py | 5 ++++- vllm/v1/metrics/ray_wrappers.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index ea54038a2c775..0898ae65e7cd3 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -47,12 +47,15 @@ def test_engine_log_metrics_ray( engine_args, stat_loggers=[RayPrometheusStatLogger]) for i, prompt in enumerate(example_prompts): - engine.generate( + results = engine.generate( request_id=f"request-id-{i}", prompt=prompt, sampling_params=SamplingParams(max_tokens=max_tokens), ) + async for _ in results: + pass + # Create the actor and call the async method actor = EngineTestActor.remote() # type: ignore[attr-defined] ray.get(actor.run.remote()) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 18c8dcf0a0d35..cce692d6c09e7 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -31,6 +31,16 @@ class RayPrometheusMetric: self.metric.set_default_tags(labelskwargs) + if labels: + if len(labels) != len(self.metric._tag_keys): + raise ValueError( + "Number of labels must match the number of tag keys. 
" + f"Expected {len(self.metric._tag_keys)}, got {len(labels)}" + ) + + self.metric.set_default_tags( + dict(zip(self.metric._tag_keys, labels))) + return self From 8711bc5e684d43a333c0c20bef575a0d8ee8346f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 4 Jun 2025 19:18:48 +0800 Subject: [PATCH 050/115] [Misc] Add packages for benchmark as extra dependency (#19089) Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/cli/README.md | 2 ++ setup.py | 1 + vllm/benchmarks/datasets.py | 39 ++++++++++++++++--------------------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index f43ce766390ad..df700fb743c06 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -77,6 +77,8 @@ vllm complete --quick "The future of AI is" Run benchmark tests for latency online serving throughput and offline inference throughput. +To use benchmark commands, please install with extra dependencies using `pip install vllm[bench]`. + Available Commands: ```bash diff --git a/setup.py b/setup.py index b07cdea302900..ea7cd0169c8bb 100644 --- a/setup.py +++ b/setup.py @@ -688,6 +688,7 @@ setup( ext_modules=ext_modules, install_requires=get_requirements(), extras_require={ + "bench": ["pandas", "datasets"], "tensorizer": ["tensorizer>=2.9.0"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index f795a12568e05..4da9f7368e631 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -24,7 +24,6 @@ from io import BytesIO from typing import Any, Callable, Optional, Union import numpy as np -import pandas as pd from PIL import Image from transformers import PreTrainedTokenizerBase @@ -33,6 +32,23 @@ from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict from vllm.multimodal.image import convert_image_mode from 
vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer +from vllm.utils import PlaceholderModule + +try: + from datasets import load_dataset +except ImportError: + datasets = PlaceholderModule("datasets") + load_dataset = datasets.placeholder_attr("load_dataset") + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") logger = logging.getLogger(__name__) @@ -636,13 +652,6 @@ class BurstGPTDataset(BenchmarkDataset): if self.dataset_path is None: raise ValueError("dataset_path must be provided for loading data.") - try: - import pandas as pd - except ImportError as e: - raise ImportError( - "Pandas is required for BurstGPTDataset. Please install it " - "using `pip install pandas`.") from e - df = pd.read_csv(self.dataset_path) # Filter to keep only GPT-4 rows. gpt4_df = df[df["Model"] == "GPT-4"] @@ -717,13 +726,6 @@ class HuggingFaceDataset(BenchmarkDataset): def load_data(self) -> None: """Load data from HuggingFace datasets.""" - try: - from datasets import load_dataset - except ImportError as e: - raise ImportError( - "Hugging Face datasets library is required for this dataset. " - "Please install it using `pip install datasets`.") from e - self.data = load_dataset( self.dataset_path, name=self.dataset_subset, @@ -1147,13 +1149,6 @@ class ASRDataset(HuggingFaceDataset): output_len: Optional[int] = None, **kwargs, ) -> list: - try: - import librosa - except ImportError as e: - raise ImportError( - "librosa is required for ASRDataset. 
Please install it " - "using `pip install librosa`.") from e - output_len = (output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN) prompt = ASRDataset.TRANSCRIPTION_PREAMBLE From 35cf32df304770b9dd3878438544b3a1a1cc79a5 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 4 Jun 2025 19:48:57 +0800 Subject: [PATCH 051/115] Improve the output precision of embedding models (#19092) --- tests/models/language/pooling/embed_utils.py | 6 +-- tests/models/language/pooling/mteb_utils.py | 12 ++--- tests/models/language/pooling/test_gte.py | 7 --- .../models/language/pooling/test_intfloat.py | 46 +++++++++++++++++++ tests/models/language/pooling/test_jina.py | 3 +- tests/models/language/pooling/test_nomic.py | 3 -- vllm/model_executor/models/bert.py | 13 ++++-- vllm/model_executor/models/bert_with_rope.py | 7 ++- 8 files changed, 69 insertions(+), 28 deletions(-) create mode 100644 tests/models/language/pooling/test_intfloat.py diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 07bc9f447e336..dabd7bee7f393 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -56,14 +56,10 @@ def correctness_test_embed_models(hf_runner, max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) - vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - model_dtype = getattr( - vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", - vllm_dtype) with hf_runner( model_info.name, - dtype=model_dtype, + dtype="float32", is_sentence_transformer=True, ) as hf_model: diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 2705be25e7cc7..0a047951db443 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -7,7 +7,6 @@ import numpy as np import pytest from tests.models.utils import EmbedModelInfo -from 
vllm.model_executor.model_loader.utils import set_default_torch_dtype # Most models on the STS12 task (See #17175): # - Model implementation and minor changes in tensor dtype @@ -104,17 +103,18 @@ def mteb_test_embed_models(hf_runner, MTEB_EMBED_TASKS) vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - with set_default_torch_dtype(vllm_dtype) and hf_runner( - model_info.name, is_sentence_transformer=True, - dtype=vllm_dtype) as hf_model: + with hf_runner(model_info.name, + is_sentence_transformer=True, + dtype="float32") as hf_model: if hf_model_callback is not None: hf_model_callback(hf_model) st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) + st_dtype = next(hf_model.model.parameters()).dtype - print("VLLM:", vllm_main_score) - print("SentenceTransformers:", st_main_score) + print("VLLM:", vllm_dtype, vllm_main_score) + print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 2178a815b71c8..05bd479f42b95 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -11,27 +11,21 @@ MODELS = [ ########## BertModel EmbedModelInfo("thenlper/gte-large", architecture="BertModel", - dtype="float32", enable_test=True), EmbedModelInfo("thenlper/gte-base", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-small", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-large-zh", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-small-zh", architecture="BertModel", - dtype="float32", enable_test=False), ########### NewModel 
EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", @@ -46,7 +40,6 @@ MODELS = [ ########### Qwen2ForCausalLM EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", architecture="Qwen2ForCausalLM", - dtype="float32", enable_test=True), ########## ModernBertModel EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py new file mode 100644 index 0000000000000..b6e83857fa70e --- /dev/null +++ b/tests/models/language/pooling/test_intfloat.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +from ...utils import EmbedModelInfo +from .embed_utils import correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models + +MODELS = [ + ########## BertModel + EmbedModelInfo("intfloat/e5-small", + architecture="BertModel", + enable_test=True), + EmbedModelInfo("intfloat/e5-base", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("intfloat/e5-large", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("intfloat/multilingual-e5-small", + architecture="BertModel", + enable_test=False), + ########## XLMRobertaModel + EmbedModelInfo("intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True), + EmbedModelInfo("intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False), + EmbedModelInfo("intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git 
a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 2adf34b292872..33255021ad6ac 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -32,8 +32,7 @@ TEXTS_2 = [ EMBEDDING_MODELS = [ EmbedModelInfo("jinaai/jina-embeddings-v3", architecture="XLMRobertaModel", - is_matryoshka=True, - dtype="float32") + is_matryoshka=True) ] diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 59dbd74fb6fb6..e16ec239a3381 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -9,18 +9,15 @@ from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", - dtype="float32", enable_test=True), EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", architecture="NomicBertModel", - dtype="float32", enable_test=False), EmbedModelInfo("nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", - dtype="float32", enable_test=True) ] diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 389393987c811..cacec7342ac2e 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -414,10 +414,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.model(input_ids=input_ids, - position_ids=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) + hidden_states = self.model(input_ids=input_ids, + position_ids=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + + # convert the embedding output to float32, + # otherwise precision will be lost 
significantly + hidden_states = hidden_states.to(torch.float32) + return hidden_states def pooler( self, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 0f22393c79d98..d1b84a9f04fa9 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -432,7 +432,12 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): else: hidden_states = self.embeddings(input_ids=input_ids, token_type_ids=token_type_ids) - return self.encoder(positions, hidden_states) + hidden_states = self.encoder(positions, hidden_states) + + # convert the embedding output to float32, + # otherwise precision will be lost significantly + hidden_states = hidden_states.to(torch.float32) + return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: From 01dc9a76db7d314aaf51be9ffc6ff561bae5626f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 4 Jun 2025 19:49:20 +0800 Subject: [PATCH 052/115] [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678) Signed-off-by: DarkLight1337 --- requirements/test.in | 2 +- requirements/test.txt | 2 +- .../multimodal/generation/test_common.py | 9 +++- .../multimodal/generation/test_florence2.py | 2 + .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_phi4mm.py | 4 ++ .../generation/vlm_utils/model_utils.py | 18 ++++++- .../multimodal/processing/test_common.py | 2 +- tests/models/registry.py | 47 ++++++------------- tests/models/test_initialization.py | 11 +++++ vllm/config.py | 2 + vllm/model_executor/models/aya_vision.py | 12 +++-- vllm/model_executor/models/idefics3.py | 16 +++++-- 13 files changed, 82 insertions(+), 47 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 9b574a09fcce5..bbbd41e168a60 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test 
datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test -transformers==4.51.3 +transformers==4.52.4 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 03aec80ac1283..fb0eede080ff1 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -794,7 +794,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.51.3 +transformers==4.52.4 # via # -r requirements/test.in # genai-perf diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index a5bbcfc22e9cd..496850b19af4f 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -226,6 +226,8 @@ VLM_TEST_SETTINGS = { img_idx_to_prompt=lambda idx: "", auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, + # FIXME: https://github.com/huggingface/transformers/pull/38510 + marks=[pytest.mark.skip("Model is broken")], ), "chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], @@ -281,10 +283,10 @@ VLM_TEST_SETTINGS = { multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, - dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, + num_logprobs=10, ), "glm4v": VLMTestInfo( models=["THUDM/glm-4v-9b"], @@ -337,7 +339,8 @@ VLM_TEST_SETTINGS = { models=[ "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-2B", - "OpenGVLab/Mono-InternVL-2B", + # FIXME: Config cannot be loaded in transformers 4.52 + # "OpenGVLab/Mono-InternVL-2B", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda 
img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 @@ -568,6 +571,8 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, + # FIXME: https://github.com/huggingface/transformers/issues/38358 + marks=[pytest.mark.skip("Model initialization fails")], ), "qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index b048cec5e5e0f..a622957f96f69 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -100,6 +100,8 @@ def run_test( ) +# FIXME: https://github.com/huggingface/transformers/issues/38358 +@pytest.mark.skip("Model initialization fails") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize( diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 14552010d3762..c5ffa5f3a70af 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -29,7 +29,7 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs -MODEL_NAME = "ibm-granite/granite-speech-3.3-8b" +MODEL_NAME = "ibm-granite/granite-speech-3.3-2b" # Audio lora co-exists directly in the model directory, but # currently still needs to be passed directly to vLLM. 
audio_lora_path = MODEL_NAME diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e4cd476a96b1d..4e8465778e256 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -122,6 +122,10 @@ def run_test( for prompts, images, audios in inputs ] + # This error occurs inside `get_peft_model` + # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75 + pytest.skip("HF impl is not compatible with current transformers") + hf_model_kwargs = {"_attn_implementation": "sdpa"} with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 1b087191f6363..af4c72f44b676 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -10,11 +10,12 @@ from typing import Optional, Union import numpy as np import numpy.typing as npt +import pytest import regex as re import torch from PIL.Image import Image from transformers import (AutoConfig, AutoTokenizer, BatchFeature, - GenerationConfig) + GenerationConfig, GenerationMixin) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side @@ -324,6 +325,16 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.processor = processor + orig_generate = hf_model.model.generate + + def _generate(self, *args, **kwargs): + # FIXME: https://github.com/huggingface/transformers/issues/38333 + kwargs["disable_compile"] = True + + return orig_generate(*args, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + return hf_model @@ -610,6 +621,11 @@ def _internvl_generate( if getattr(self, "use_visual_token_mask", False): visual_token_mask = selected.reshape(B, N, 
1).to(input_embeds.dtype) forward_kwargs["visual_token_mask"] = visual_token_mask + + # e.g. InternVL2-2B + if not isinstance(self.language_model, GenerationMixin): + pytest.skip("HF impl is not compatible with current transformers") + outputs = self.language_model.generate( **forward_kwargs, **generate_kwargs, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index be574435e0995..1e6608955b31b 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -245,7 +245,7 @@ def _test_processing_correctness_one( "adept/fuyu-8b", "google/gemma-3-4b-it", "THUDM/glm-4v-9b", - "ibm-granite/granite-speech-3.3-8b", + "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", diff --git a/tests/models/registry.py b/tests/models/registry.py index ed49676a9f5d6..3e07dc0f322e1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -160,17 +160,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", - is_available_online=False, - min_transformers_version="4.52.2"), + min_transformers_version="4.53"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), - "Glm4ForCausalLM": _HfExamplesInfo( - "THUDM/GLM-4-32B-0414", - is_available_online=False, - min_transformers_version="4.52.dev0" - ), + "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": 
_HfExamplesInfo("bigcode/starcoder", @@ -181,8 +176,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"1b": "EleutherAI/pythia-1.4b"}), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), - "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501 - min_transformers_version="4.52.0"), # noqa: E501 + "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501 "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1", trust_remote_code=True), @@ -203,8 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", is_available_online=False), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), - "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1", - is_available_online=False), + "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True), @@ -243,10 +236,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), - "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", - is_available_online=False), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 - is_available_online=False), + v0_only=True), "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t", v0_only=True), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), @@ -256,7 +248,7 @@ 
_TEXT_GENERATION_EXAMPLE_MODELS = { "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407", trust_remote_code=True), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", - is_available_online=False, + tokenizer="meta-llama/Llama-2-7b", trust_remote_code=True), "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", @@ -275,8 +267,7 @@ _EMBEDDING_EXAMPLE_MODELS = { trust_remote_code=True), "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True, - hf_overrides={"architectures": - ["GteNewModel"]}), + hf_overrides={"architectures": ["GteNewModel"]}), # noqa: E501 "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward", trust_remote_code=True), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501 @@ -298,10 +289,8 @@ _EMBEDDING_EXAMPLE_MODELS = { "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 - # The model on Huggingface is currently being updated, - # hence I temporarily mark it as not available online - "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 - is_available_online=False), + "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 + is_available_online=False), # noqa: E501 } _CROSS_ENCODER_EXAMPLE_MODELS = { @@ -327,8 +316,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), - "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-8b", # noqa: E501 - min_transformers_version="4.52.0"), # noqa: E501 + 
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501 "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 @@ -347,7 +335,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, v0_only=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 - min_transformers_version="4.51", max_model_len=10240), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 @@ -360,8 +347,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", - max_transformers_version="4.48", - transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 @@ -399,10 +384,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501 - "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", - min_transformers_version="4.52"), - "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501 - min_transformers_version="4.52"), + "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), + "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), 
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 @@ -413,8 +396,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer # Therefore, we borrow the BartTokenizer from the original Bart model "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 - tokenizer="Isotr0py/Florence-2-tokenizer", - trust_remote_code=True,), # noqa: E501 + tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 + trust_remote_code=True), # noqa: E501 "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 } diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index af023d9034383..98a58d01e2a18 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -21,6 +21,10 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + # FIXME: Possible memory leak in the previous tests? 
+ if model_arch == "GraniteSpeechForConditionalGeneration": + pytest.skip("Avoid OOM") + # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) @@ -41,6 +45,13 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): "num_hidden_layers": 1, }) + # e.g.: ibm-granite/granite-speech-3.3-2b + if hasattr(hf_config, "encoder_config"): + hf_config.encoder_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + return hf_config # Avoid calling model.forward() diff --git a/vllm/config.py b/vllm/config.py index f6ca9328b8a19..a07c41ddab198 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3139,6 +3139,8 @@ def _find_dtype( config_dtype = getattr(config.get_text_config(), "torch_dtype", None) if config_dtype is None and hasattr(config, "vision_config"): config_dtype = getattr(config.vision_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "encoder_config"): + config_dtype = getattr(config.encoder_config, "torch_dtype", None) # Try to read the dtype of the weights if they are in safetensors format if config_dtype is None: diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 22efb707af738..7e15e57a4d032 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -111,7 +111,13 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + + # Temporary workaround since this processor has multiple image tokens + # See https://github.com/huggingface/transformers/issues/38350 + processor._check_special_mm_tokens = lambda *args, **kwargs: None + + return processor def 
get_image_processor(self) -> GotOcr2ImageProcessor: return self.get_hf_processor().image_processor @@ -188,9 +194,7 @@ class AyaVisionMultiModalProcessor( image_processor = hf_processor.image_processor # HF processor pops the `num_patches` kwarg, which is needed by vLLM - if (images := - mm_data.get("images")) is not None and '' in prompt: - assert isinstance(images, list) + if (images := mm_data.get("images")) is not None: parsed_images = (self._get_data_parser().parse_mm_data({ "image": images diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 4bc5e2a0cfaea..de8596282ca9c 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -22,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union import torch from torch import nn -from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, - Idefics3Processor) +from transformers import (AddedToken, BatchFeature, Idefics3Config, + Idefics3ImageProcessor, Idefics3Processor) from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -199,13 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): return grid_w * grid_h + 1 + # TODO: Remove after requiring transformers>=4.52 + def _get_content(self, token: Union[AddedToken, str]) -> str: + if isinstance(token, str): + return token + + return token.content + def _get_image_token( self, processor: Optional[Idefics3Processor]) -> tuple[str, str, str]: if processor is None: processor = self.get_hf_processor() - image_token = processor.image_token.content - fake_image_token = processor.fake_image_token.content + + image_token = self._get_content(processor.image_token) + fake_image_token = self._get_content(processor.fake_image_token) global_image_token = processor.global_image_tag return image_token, fake_image_token, global_image_token From 02658c2dfed40acaf04c8d2470b3493e8fead523 Mon Sep 17 00:00:00 2001 From: Xu 
Wenqing <121550081+Xu-Wenqing@users.noreply.github.com> Date: Wed, 4 Jun 2025 21:24:18 +0800 Subject: [PATCH 053/115] Add DeepSeek-R1-0528 function call chat template (#18874) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- docs/features/tool_calling.md | 6 +- examples/tool_chat_template_deepseekr1.jinja | 92 ++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 examples/tool_chat_template_deepseekr1.jinja diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 6ee1060dd050a..3547069f724dc 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -238,9 +238,11 @@ Flags: `--tool-call-parser hermes` ### DeepSeek-V3 Models (`deepseek_v3`) Supported models: -* `deepseek-ai/DeepSeek-V3-0324` -Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja` +* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>) +* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>) + +Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}` ### Models with Pythonic Tool Calls (`pythonic`) diff --git a/examples/tool_chat_template_deepseekr1.jinja b/examples/tool_chat_template_deepseekr1.jinja new file mode 100644 index 0000000000000..9ae19341fc48a --- /dev/null +++ b/examples/tool_chat_template_deepseekr1.jinja @@ -0,0 +1,92 @@ +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor %} + +{#- Adapted 
from https://github.com/sgl-project/sglang/blob/main/examples/chat_template/tool_chat_template_deepseekr1.jinja #} +{% if tools is defined and tools is not none %} + {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. ' + 'When a tool call is needed, you MUST use the following format to issue the call:\n' + '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>FUNCTION_NAME\n' + '```json\n{"param1": "value1", "param2": "value2"}\n```<|tool▁call▁end|><|tool▁calls▁end|>\n\n' + 'Make sure the JSON is valid.' + '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %} + {% for tool in tools %} + {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %} + {% endfor %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %} +{% endif %} + +{{ bos_token }} +{{ ns.system_prompt }} +{%- for message in messages %} + {% set content = message['content'] %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {%- set ns.is_first = false -%} + {%- set ns.is_last_user = true -%} + {{'<|User|>' + content + '<|Assistant|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' %} + {% if '</think>' in content %} + {% set content = content.split('</think>')[-1] %} + {% endif %} + {% endif %} + {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} + {%- endif %} + {%- set ns.is_first = false %} + {%- set ns.is_tool = false -%} + {%- set ns.is_output_first = true %} + {%- for tool in message['tool_calls'] %} + {%- if not ns.is_first %} + {%- if content is none %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- else %} + {{content + 
'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- set ns.is_first = true -%} + {%- else %} + {{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- endfor %} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>' + content + '<|end▁of▁sentence|>'}} + {%- set ns.is_tool = false -%} + {%- else %} + {{content + '<|end▁of▁sentence|>'}} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_last_user = false -%} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{'\n<|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} +{% endif %} +{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %} + {{'<|Assistant|>'}} +{% endif %} From 5f2cd251d212eed3052c5406875eb26811335d3e Mon Sep 17 00:00:00 2001 From: Lain Date: Wed, 4 Jun 2025 07:48:45 -0700 Subject: [PATCH 054/115] Sm100 blockwise fp8 swap ab (#18564) --- .../c3x/scaled_mm_blockwise_sm100_fp8.cu | 4 - ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 204 ++++++++++++------ .../layers/quantization/utils/fp8_utils.py | 14 -- 3 files changed, 139 insertions(+), 83 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu 
b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu index 84492553c02f2..4a8a5ed02d6ce 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu @@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - TORCH_CHECK( - a.size(0) % 4 == 0, - "Input tensor must have a number of rows that is a multiple of 4. ", - "but got: ", a.size(0), " rows."); if (out.dtype() == torch::kBFloat16) { cutlass_gemm_blockwise_sm100_fp8_dispatch( out, a, b, a_scales, b_scales); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index ef324364c6d5e..c841125dbb734 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -1,5 +1,6 @@ #pragma once +#include "cuda_utils.h" #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" @@ -22,49 +23,49 @@ namespace vllm { using namespace cute; -template +// clang-format off +template struct cutlass_3x_gemm_fp8_blockwise { + static constexpr bool swap_ab = swap_ab_; using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; using ElementB = ElementAB; using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - using ElementC = void; using ElementD = OutType; using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename 
cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + using ElementC = void; // TODO: support bias using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; using ElementCompute = float; using ElementBlockScale = float; - // MMA and Cluster Tile Shapes - // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster - // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>; - static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{}); - static constexpr int ScaleGranularityM = - size<0>(MmaTileShape{}) / ScaleMsPerTile; - static constexpr int ScaleGranularityN = - size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{}); - static constexpr int ScaleGranularityK = - size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{}); + using ScaleConfig = conditional_t, + cutlass::detail::Sm100BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>>; - // Shape of the threadblocks in a cluster - using ClusterShape_MNK = ClusterShape; - - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, - cute::UMMA::Major::MN, cute::UMMA::Major::K>; + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); @@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; using ElementScalar = float; - // clang-format off using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, @@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise { ElementAccumulator, ElementCompute, ElementC, - LayoutC, + conditional_t, AlignmentC, ElementD, - LayoutD, + conditional_t, AlignmentD, EpilogueScheduler, DefaultOperation >::CollectiveOp; using StageCountType = cutlass::gemm::collective::StageCountAuto; - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, - OperatorClass, - ElementA, - cute::tuple, - AlignmentA, - ElementB, - cute::tuple, - AlignmentB, - ElementAccumulator, - MmaTileShape, - ClusterShape, - + using CollectiveMainloop = conditional_t, + AlignmentB, + ElementA, + cute::tuple, + AlignmentA, + ElementAccumulator, + MmaTileShape, + ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainloopScheduler - >::CollectiveOp; - // clang-format on + MainloopScheduler + >::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp>; using KernelType = enable_sm100_only, CollectiveMainloop, CollectiveEpilogue>>; @@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, 
torch::Tensor const& b_scales) { + static constexpr bool swap_ab = Gemm::swap_ab; using GemmKernel = typename Gemm::GemmKernel; using StrideA = typename Gemm::GemmKernel::StrideA; using StrideB = typename Gemm::GemmKernel::StrideB; @@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementD = typename Gemm::ElementD; int32_t m = a.size(0), n = b.size(1), k = a.size(1); - auto prob_shape = cute::make_shape(m, n, k, 1); StrideA a_stride; StrideB b_stride; @@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); c_stride = - cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); - LayoutSFA layout_SFA = + LayoutSFA layout_SFA = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); - LayoutSFB layout_SFB = + LayoutSFB layout_SFB = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast(a.data_ptr()); @@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, auto a_scales_ptr = static_cast(a_scales.data_ptr()); auto b_scales_ptr = static_cast(b_scales.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB}; + auto mainloop_args = [&](){ + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
+ if (swap_ab) { + return typename GemmKernel::MainloopArguments{ + b_ptr, b_stride, a_ptr, a_stride, + b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB + }; + } + else { + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + } + }(); + auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ @@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto m = a.size(0); - auto k = a.size(1); - auto n = b.size(1); - int sms; + int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); - auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) { - return std::ceil(static_cast(m) / tile1SM) * - std::ceil(static_cast(n) / tile1SM) >= - sms; - }; - bool use_2sm = should_use_2sm(m, n); - if (use_2sm) { - cutlass_gemm_caller_blockwise, Shape<_256, _1, _1>, - Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( - out, a, b, a_scales, b_scales); + constexpr int TILE_K = 128; + // TODO: better heuristics + bool swap_ab = (m < 16) || (m % 4 != 0); + bool use_tma_epilogue = (m * n) % 4 == 0; + if (!swap_ab) { + constexpr int TILE_N = 128; + int tile_m = 256; + if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) { + tile_m = 64; + } + else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) { + tile_m = 128; + } + if (tile_m == 64) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, 
b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else if (tile_m == 128) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else { // tile_m == 256 + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } + } } else { + // TODO: Test more tile N configs + constexpr int TILE_M = 128; + constexpr int TILE_N = 16; + // TMA epilogue isn't compatible with Swap A/B cutlass_gemm_caller_blockwise, Shape<_128, _1, _1>, - Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + OutType, TILE_M, 1, TILE_K, Shape, Int, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( out, a, b, a_scales, b_scales); } } diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 1ebd2a8985824..270979c8e932e 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ 
b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -136,24 +136,10 @@ def apply_w8a8_block_fp8_linear( use_cutlass, use_aiter_and_is_supported) if use_cutlass: - rows, cols = input_2d.shape - # Blackwell GPUs (SM100) require row dimensions to be multiple of 4 for - # optimal tensor core usage. Can be removed when targeting platforms - # without this constraint. - should_pad = current_platform.has_device_capability( - 100) and rows % 4 != 0 - if should_pad: - input_2d = torch.nn.functional.pad(input_2d, - (0, 0, 0, 4 - (rows % 4)), - value=0).contiguous() - q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, block_size, input.dtype) - if should_pad: - output = output[:rows, :] else: q_input, x_scale = per_token_group_quant_fp8( From 8f4ffbd373cb19e8f8dcfa6dec1dbbe98fbeae96 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 4 Jun 2025 22:57:55 +0800 Subject: [PATCH 055/115] [Doc] Update V1 Guide for embedding models (#19141) Signed-off-by: DarkLight1337 --- docs/usage/v1_guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 7c4909cb5d913..baeb5411bcfdf 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -55,7 +55,7 @@ This living user guide outlines a few known **important changes and limitations* | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| | **Structured Output Alternative Backends** | 🟡 Planned | -| **Embedding Models** | 🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015)) | +| **Embedding Models** | 🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188)) | | **Mamba Models** | 🟡 Planned | | **Encoder-Decoder Models** | 🟠 Delayed | | 
**Request-level Structured Output Backend** | 🔴 Deprecated | @@ -145,9 +145,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco and the majority fall into the following categories. V1 support for these models will be added eventually. **Embedding Models** -Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work. +The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188). -Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this. +Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. 
**Mamba Models** Models using selective state-space mechanisms (instead of standard transformer attention) From c8dcc159214a20650451dcd64b226f56671763f1 Mon Sep 17 00:00:00 2001 From: jmswen Date: Wed, 4 Jun 2025 08:26:47 -0700 Subject: [PATCH 056/115] Allow AsyncLLMEngine.generate to target a specific DP rank (#19102) Signed-off-by: Jon Swenson --- .../multi_instance_data_parallel.py | 58 +++++++++++++++++++ tests/tokenization/test_detokenize.py | 3 +- tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + tests/v1/engine/test_output_processor.py | 5 ++ vllm/engine/async_llm_engine.py | 12 +++- vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 5 +- vllm/v1/engine/core_client.py | 14 ++++- vllm/v1/engine/processor.py | 2 + 10 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 examples/online_serving/multi_instance_data_parallel.py diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py new file mode 100644 index 0000000000000..62b1ec71af14d --- /dev/null +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: Apache-2.0 +import asyncio +from typing import Optional + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams + +""" +To run this example, run the following commands simultaneously with +different CUDA_VISIBLE_DEVICES: + python examples/online_serving/multi_instance_data_parallel.py + + vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \ + --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \ + --data-parallel-size-local 1 --enforce-eager --headless + +Once both instances have completed the handshake, this example will +send a request to the instance with DP rank 1. 
+""" + + +async def main(): + engine_args = AsyncEngineArgs( + model="ibm-research/PowerMoE-3b", + data_parallel_size=2, + dtype="auto", + max_model_len=2048, + data_parallel_address="127.0.0.1", + data_parallel_rpc_port=62300, + data_parallel_size_local=1, + enforce_eager=True, + ) + + engine_client = AsyncLLMEngine.from_engine_args(engine_args) + + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.9, + max_tokens=100, + ) + + prompt = "Who won the 2004 World Series?" + final_output: Optional[RequestOutput] = None + async for output in engine_client.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id="abcdef", + data_parallel_rank=1, + ): + final_output = output + if final_output: + print(final_output.outputs[0].text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index b289dc972c89b..9f2414eca24f3 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -70,7 +70,8 @@ def _run_incremental_decode(tokenizer, None, 0.0, None, - cache_salt=None) + cache_salt=None, + data_parallel_rank=None) if fast is None: detokenizer = IncrementalDetokenizer.from_new_request( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 3d7632a6037f7..1cbbf30371afd 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -42,6 +42,7 @@ def make_request() -> EngineCoreRequest: arrival_time=time.time(), lora_request=None, cache_salt=None, + data_parallel_rank=None, ) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 47181d36f4ccc..c2dc3b4731b5a 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -56,6 +56,7 @@ def make_request( arrival_time=time.time(), lora_request=None, cache_salt=None, + data_parallel_rank=None, ) diff --git 
a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index a83454ee67e73..6b88b0cf17e32 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -59,6 +59,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -406,6 +407,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -569,6 +571,7 @@ def test_stop_token(include_stop_str_in_output: bool, eos_token_id=eos_token_id, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -666,6 +669,7 @@ def test_stop_string(include_stop_str_in_output: bool, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -780,6 +784,7 @@ def test_iteration_stats(dummy_test_vectors): eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams(), ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6d8d97cf5feba..59971f5d65afa 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -442,6 +442,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> None: ... 
@@ -456,6 +457,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> None: ... @@ -473,6 +475,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: @@ -902,6 +905,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, PoolingRequestOutput], None]]: ... @@ -917,6 +921,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, PoolingRequestOutput], None]]: ... 
@@ -935,6 +940,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: @@ -967,6 +973,7 @@ class AsyncLLMEngine(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ) return stream.generator() @@ -980,6 +987,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -999,7 +1007,8 @@ class AsyncLLMEngine(EngineClient): for generation, if any. priority: The priority of the request. Only applicable with priority scheduling. - + data_parallel_rank: The (global) data parallel rank that must + handle this request. Only applicable if DP is enabled. Yields: The output `RequestOutput` objects from the LLMEngine for the request. 
@@ -1057,6 +1066,7 @@ class AsyncLLMEngine(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ): yield LLMEngine.validate_output(output, RequestOutput) except asyncio.CancelledError: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d1bec25237d62..59463f1ba99f5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -55,6 +55,7 @@ class EngineCoreRequest( arrival_time: float lora_request: Optional[LoRARequest] cache_salt: Optional[str] + data_parallel_rank: Optional[int] # Index of the client, used to ensure outputs are sent back to the same # client for this request when scaling out the front-end. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0e369632156bd..61ea3c4c3dab4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -229,6 +229,7 @@ class AsyncLLM(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> RequestOutputCollector: """Add new request to the AsyncLLM.""" @@ -245,7 +246,7 @@ class AsyncLLM(EngineClient): prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, tokenization_kwargs, trace_headers, prompt_adapter_request, - priority) + priority, data_parallel_rank) if params.n == 1: await self._add_request(request, prompt_str, None, 0, queue) @@ -291,6 +292,7 @@ class AsyncLLM(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request @@ -321,6 +323,7 @@ class AsyncLLM(EngineClient): trace_headers=trace_headers, 
prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ) # The output_handler task pushes items into the queue. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index adb0709c828a7..0cd58d01df7f7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -982,7 +982,16 @@ class DPAsyncMPClient(AsyncMPClient): resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) - def get_core_engine_for_request(self) -> CoreEngine: + def get_core_engine_for_request(self, + dp_rank: Optional[int] = None + ) -> CoreEngine: + if dp_rank is not None: + # engines are already in rank order + if dp_rank < 0 or dp_rank >= len(self.core_engines): + raise ValueError(f"Requested DP rank {dp_rank} is out of " + f"range [0, {len(self.core_engines)})") + return self.core_engines[dp_rank] + if not self.lb_engines: return self.core_engines[0] # TODO use P2C alg for larger DP sizes @@ -1018,7 +1027,8 @@ class DPAsyncMPClient(AsyncMPClient): request.current_wave = self.current_wave request.client_index = self.client_index - chosen_engine = self.get_core_engine_for_request() + chosen_engine = self.get_core_engine_for_request( + request.data_parallel_rank) self.reqs_in_flight[request.request_id] = chosen_engine to_await = self._send_input(EngineCoreRequestType.ADD, request, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c0d01d9b6f61..546fc98d681c6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -212,6 +212,7 @@ class Processor: trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> tuple[Optional[str], EngineCoreRequest]: # TODO(woosuk): Support pooling models. 
@@ -328,6 +329,7 @@ class Processor: arrival_time=arrival_time, lora_request=lora_request, cache_salt=decoder_inputs.get("cache_salt"), + data_parallel_rank=data_parallel_rank, ) def _validate_model_inputs(self, From d459fae0a2c464e28680bc6d564c1de1b295029e Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 4 Jun 2025 11:39:23 -0400 Subject: [PATCH 057/115] [Bugfix][EP+DP] Fix internode check (#19112) Signed-off-by: Tyler Michael Smith --- vllm/distributed/device_communicators/all2all.py | 6 ------ .../device_communicators/base_device_communicator.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 2ab3779ece056..cab2496bfba78 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -84,10 +84,6 @@ class PPLXAll2AllManager(All2AllManagerBase): assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." 
# noqa super().__init__(cpu_group) - # TODO(tms): Disable pplx-a2a intranode as it fails with the error: - # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa - self.internode = True - if self.internode: # inter-node communication needs nvshmem, # intra-node communication uses p2p mapping directly @@ -178,7 +174,6 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): num_rdma_bytes = 1024 * 1024 * 1024 num_qps_per_rank = self.num_sms // 2 else: - assert self.intranode num_rdma_bytes = 0 num_qps_per_rank = 1 @@ -243,7 +238,6 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): if self.internode: num_rdma_bytes = 1024 * 1024 * 1024 else: - assert self.intranode num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, hidden=token_hidden_size, diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 38370d4dc2b51..1bc2d8e0281c7 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -49,8 +49,7 @@ class All2AllManagerBase: # all2all communication often has separate implementations for # intra-node and inter-node communication - self.intranode = in_the_same_node_as(cpu_group, source_rank=0) - self.internode = not self.intranode + self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0)) def get_handle(self, kwargs): # get a handle for the all2all communication, From 53a5a0ce30dd623808ebd02947e5183f918b6c2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 4 Jun 2025 13:46:28 -0400 Subject: [PATCH 058/115] [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778) Signed-off-by: mgoin --- .../c3x/scaled_mm_sm100_fp8_dispatch.cuh | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git 
a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 468b77d9593bc..6da2da6340759 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm100_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; @@ -25,6 +26,34 @@ struct sm100_fp8_config_default { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm100_fp8_config_M128 { + // M in (64, 128] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in [1, 64] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + template typename Epilogue, typename... 
EpilogueArgs> @@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm100_fp8_config_default::Cutlass3xGemm; - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm100_fp8_config_M128::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } } template