From 5ac3168ee342f4cae17b0b67375e647bd5dd9151 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 25 Jul 2025 23:52:42 +0800 Subject: [PATCH 001/182] [Docs] add auto-round quantization readme (#21600) Signed-off-by: Wenhua Cheng Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/quantization/README.md | 1 + docs/features/quantization/auto_round.md | 103 +++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 docs/features/quantization/auto_round.md diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index e8c3b11230786..e18c128f30fc9 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -6,6 +6,7 @@ Contents: - [Supported Hardware](supported_hardware.md) - [AutoAWQ](auto_awq.md) +- [AutoRound](auto_round.md) - [BitsAndBytes](bnb.md) - [BitBLAS](bitblas.md) - [GGUF](gguf.md) diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md new file mode 100644 index 0000000000000..2dfd847bb7d9a --- /dev/null +++ b/docs/features/quantization/auto_round.md @@ -0,0 +1,103 @@ +# AutoRound + +[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8** +quantized large language models—striking an optimal balance between accuracy and deployment performance. + +AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster +inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs, +Intel GPUs, HPUs, and CUDA-enabled devices**. + +Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details. 
+ +Key Features: + +✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported + +✅ **10+ vision-language models (VLMs)** are supported + +✅ **Per-layer mixed-bit quantization** for fine-grained control + +✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss + +✅ **Multiple quantization recipes**: best, base, and light + +✅ Advanced utilities such as immediate packing and support for **10+ backends** + +## Installation + +```bash +uv pip install auto-round +``` + +## Quantizing a model + +For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage. + +### CLI usage + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --bits 4 \ + --group_size 128 \ + --format "auto_round" \ + --output_dir ./tmp_autoround +``` + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --format "gguf:q4_k_m" \ + --output_dir ./tmp_autoround +``` + +### API usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +from auto_round import AutoRound + +model_name = "Qwen/Qwen3-0.6B" +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_name) + +bits, group_size, sym = 4, 128, True +autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) + +# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower +# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym) + +# 2-3X speedup, slight accuracy drop at W4G128 +# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym ) + +output_dir = "./tmp_autoround" +# format= 'auto_round'(default), 'auto_gptq', 'auto_awq' +autoround.quantize_and_save(output_dir, format="auto_round") +``` + +## Running a quantized model with vLLM + +Here is some example code to run auto-round format in vLLM: + +```python +from vllm 
import LLM, SamplingParams + +prompts = [ + "Hello, my name is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95) +model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound" +llm = LLM(model=model_name) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +# Acknowledgement + +Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and +ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound. From 7cfea0df390c154c1026f77d3682e2733ca4aca8 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 25 Jul 2025 13:22:01 -0700 Subject: [PATCH 002/182] [TPU][Test] Rollback PR-21550. (#21619) Signed-off-by: Qiliang Cui --- tests/v1/tpu/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index dd89059ded524..865b58bc7f4b0 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -59,7 +59,7 @@ def test_basic( # actually test chunked prompt max_num_batched_tokens=1024, max_model_len=8192, - gpu_memory_utilization=0.95, + gpu_memory_utilization=0.7, max_num_seqs=max_num_seqs, tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, From 41d3082c416897092bc924bc341e86b3e49728ee Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 25 Jul 2025 17:06:48 -0700 Subject: [PATCH 003/182] Add Unsloth to RLHF.md (#21636) --- docs/training/rlhf.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md index 4f75e4e01495c..f608a630ab7a5 100644 --- a/docs/training/rlhf.md +++ b/docs/training/rlhf.md @@ -2,10 +2,14 @@ Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models 
using human-generated preference data to align model outputs with desired behaviors. -vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). +vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth). See the following basic examples to get started if you don't want to use an existing library: - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) + +See the following notebooks showing how to use vLLM for GRPO: + +- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb) From 75d29cf4e1d7e950c2308b12e944b507fb3e1916 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 25 Jul 2025 20:07:07 -0400 Subject: [PATCH 004/182] [Perf] Cuda Kernel for Int8 Per Token Group Quant (#21476) Signed-off-by: yewentao256 --- csrc/ops.h | 5 +++++ .../compressed_tensors/int8_quant_kernels.cu | 10 ++++++++++ csrc/quantization/fp8/per_token_group_quant.cu | 6 +++++- csrc/quantization/per_token_group_quant_8bit.h | 10 ++++++++++ csrc/torch_bindings.cpp | 8 ++++++++ .../layers/quantization/utils/int8_utils.py | 11 +++++++++-- 6 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 csrc/quantization/per_token_group_quant_8bit.h diff --git a/csrc/ops.h 
b/csrc/ops.h index 97a247d9d628c..207291eceb169 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double fp8_min, double fp8_max, bool scale_ue8m0); + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 5cd2ac179768b..6a81f159f46ae 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include "../../dispatch_utils.h" @@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant( } }); } + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max) { + per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, + int8_min, int8_max); +} \ No newline at end of file diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index afc41faeca902..2609054f2072b 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include @@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double min_8bit, double max_8bit, - bool scale_ue8m0 = false) { + bool scale_ue8m0) { 
TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(output_q.is_contiguous()); @@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input, input.scalar_type(), "per_token_group_quant_8bit", ([&] { if (dst_type == at::ScalarType::Float8_e4m3fn) { LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + } else if (dst_type == at::ScalarType::Char) { + LAUNCH_KERNEL(scalar_t, int8_t); } })); diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/per_token_group_quant_8bit.h new file mode 100644 index 0000000000000..537b61bc4303f --- /dev/null +++ b/csrc/quantization/per_token_group_quant_8bit.h @@ -0,0 +1,10 @@ +#pragma once +#include + +// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders +// 8-bit per-token-group quantization helper used by both FP8 and INT8 +void per_token_group_quant_8bit(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double min_8bit, double max_8bit, + bool scale_ue8m0 = false); \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 95f8541bc9e2d..85b6abef00b03 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("per_token_group_fp8_quant", torch::kCUDA, &per_token_group_quant_fp8); + // Compute per-token-group INT8 quantized tensor and scaling factor. + ops.def( + "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! 
" + "output_s, int group_size, float eps, float int8_min, float int8_max) -> " + "()"); + ops.impl("per_token_group_quant_int8", torch::kCUDA, + &per_token_group_quant_int8); + // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, " diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 1fdf7d174e25e..6840cabbf1ae3 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -238,13 +238,20 @@ def per_token_group_quant_int8( int8_min = iinfo.min x_q = torch.empty_like(x, device=x.device, dtype=dtype) - M = x.numel() // group_size - N = group_size x_s = torch.empty( x.shape[:-1] + (x.shape[-1] // group_size, ), device=x.device, dtype=torch.float32, ) + # prefer CUDA kernel if available + if current_platform.is_cuda(): + torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps, + float(int8_min), + float(int8_max)) + return x_q, x_s + + M = x.numel() // group_size + N = group_size BLOCK = triton.next_power_of_2(N) # heuristics for number of warps From 2eddd437ba5e7ce80d7341bf87a3078802b01ba7 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:07:26 -0700 Subject: [PATCH 005/182] Add interleaved RoPE test for Llama4 (Maverick) (#21478) Signed-off-by: Yong Hoon Shin --- .../multimodal/generation/test_maverick.py | 92 +++++++++++++++---- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py index 306cf39002df2..bacc9ef94f49d 100644 --- a/tests/models/multimodal/generation/test_maverick.py +++ b/tests/models/multimodal/generation/test_maverick.py @@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, 
GenerationConfig) from vllm import LLM, SamplingParams +from vllm.v1.executor.abstract import Executor +from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, + FullAttentionSpec) from ....utils import multi_gpu_test @@ -69,6 +72,26 @@ def run_maverick_serving(model: str): raise +def get_rope_layers_config(model_path: str) -> list[int]: + """ + Get the interleaved RoPE configuration from HuggingFace config + + Args: + model_path: Path to the local directory containing the reduced + Maverick model checkpoint + + Returns: + List of 0 or 1 indicating whether each layer uses RoPE and local attn + 0 indicates that RoPE is not used while 1 indicates that RoPE is used. + """ + config_path = Path(model_path) / "config.json" + model_config = json.loads(config_path.read_text()) + text_config = model_config["text_config"] + no_rope_layers = text_config["no_rope_layers"] + print(f"Found no_rope_layers: {no_rope_layers}") + return no_rope_layers + + def create_reduced_maverick_model( original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -113,7 +136,6 @@ def create_reduced_maverick_model( print("Loading original model configuration...") original_config = AutoConfig.from_pretrained(original_model_name, trust_remote_code=True) - print("Creating reduced configuration...") reduced_config = create_reduced_config(original_config, text_layers, num_experts, vision_layers) @@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor], f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB") -def run_reduced_model(model_path: str, - should_profile: bool = False, - **kwargs) -> None: - """Test the created reduced model with vLLM.""" - - print(f"\nTesting reduced model at {model_path}...") - - llm = LLM( - model=model_path, - trust_remote_code=True, - max_model_len=512, # Small context for testing - gpu_memory_utilization=0.3, # Conservative memory usage - **kwargs, +def check_attention_spec_interleaved_rope( + llm: LLM, 
+ num_attention_layers: int, + num_ranks: int, + rope_layers: list[int], +): + """Check that the attention spec is correct.""" + assert isinstance(llm.llm_engine.model_executor, Executor) + kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs( ) + for rank in range(num_ranks): + kv_cache_specs = kv_cache_specs_per_rank[rank] + assert len(kv_cache_specs.keys()) == num_attention_layers + for i in range(num_attention_layers): + if rope_layers[i] == 0: + expected_spec = FullAttentionSpec + else: + expected_spec = ChunkedLocalAttentionSpec + assert isinstance( + kv_cache_specs[ + f"language_model.model.layers.{i}.self_attn.attn"], + expected_spec) + +def run_reduced_model(llm: LLM, should_profile: bool = False) -> None: + """Test the created reduced model with vLLM.""" sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50) @@ -551,6 +584,7 @@ def run_reduced_model(model_path: str, @pytest.mark.parametrize("tp,ep", [(2, True)]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_dummy_maverick( + monkeypatch, original_model_name: str, text_layers: int, num_experts: int, @@ -562,6 +596,10 @@ def test_dummy_maverick( force_recreate: bool = True, profile: bool = False, ) -> None: + # Disable multiprocessing allows us to access model executor from LLM engine + monkeypatch.setenv("VLLM_USE_V1", "1") + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + model_path = create_reduced_maverick_model( original_model_name=original_model_name, output_dir=output_dir, @@ -573,11 +611,27 @@ def test_dummy_maverick( print(f"\nReduced model created successfully at: {model_path}") - run_reduced_model(model_path=model_path, - should_profile=profile, - enforce_eager=enforce_eager, - tensor_parallel_size=tp, - enable_expert_parallel=ep) + rope_layers = get_rope_layers_config(model_path) + + llm = LLM( + model=model_path, + trust_remote_code=True, + max_model_len=512, # Small context for testing + 
gpu_memory_utilization=0.3, # Conservative memory usage + enforce_eager=enforce_eager, + tensor_parallel_size=tp, + enable_expert_parallel=ep, + ) + + check_attention_spec_interleaved_rope( + llm, + text_layers, + tp, + rope_layers, + ) + + print(f"\nTesting reduced model at {model_path}...") + run_reduced_model(llm=llm, should_profile=profile) def main(): From cea96a015678c86789fa86a719ce7d6d176d78fd Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:07:58 -0700 Subject: [PATCH 006/182] [Bugfix] Fix sync_and_slice_intermediate_tensors (#21537) Signed-off-by: Rui Qiao --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5fe594db667a5..6ddb2c422dff7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1270,7 +1270,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if sync_self: assert intermediate_tensors is not None for k, v in intermediate_tensors.items(): - is_scattered = "residual" and is_residual_scattered + is_scattered = k == "residual" and is_residual_scattered copy_len = num_tokens // tp if is_scattered else \ num_tokens self.intermediate_tensors[k][:copy_len].copy_( From c7742d61134783b50098ab249f6815051a4c4a2a Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:08:30 -0700 Subject: [PATCH 007/182] [Bugfix] Always set RAY_ADDRESS for Ray actor before spawn (#21540) Signed-off-by: Rui Qiao --- vllm/utils/__init__.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9f4140ac64e2f..054037b8932b7 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2883,26 +2883,27 @@ def _maybe_force_spawn(): if 
os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn": return - reason = None - if cuda_is_initialized(): - reason = "CUDA is initialized" - elif xpu_is_initialized(): - reason = "XPU is initialized" - elif is_in_ray_actor(): + reasons = [] + if is_in_ray_actor(): # even if we choose to spawn, we need to pass the ray address # to the subprocess so that it knows how to connect to the ray cluster. # env vars are inherited by subprocesses, even if we use spawn. import ray os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address - reason = "In a Ray actor and can only be spawned" + reasons.append("In a Ray actor and can only be spawned") - if reason is not None: + if cuda_is_initialized(): + reasons.append("CUDA is initialized") + elif xpu_is_initialized(): + reasons.append("XPU is initialized") + + if reasons: logger.warning( "We must use the `spawn` multiprocessing start method. " "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " "See https://docs.vllm.ai/en/latest/usage/" "troubleshooting.html#python-multiprocessing " - "for more information. Reason: %s", reason) + "for more information. 
Reasons: %s", "; ".join(reasons)) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From f1b286b2fbbde18745d57b0ce7ac4fbc56f10f0d Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Fri, 25 Jul 2025 17:09:00 -0700 Subject: [PATCH 008/182] [TPU] Update ptxla nightly version to 20250724 (#21555) Signed-off-by: Chengji Yao --- docker/Dockerfile.tpu | 2 +- requirements/tpu.txt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index 3474ff50de7bd..b9fc9def88190 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250714" +ARG NIGHTLY_DATE="20250724" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/requirements/tpu.txt b/requirements/tpu.txt index d86f643d388ba..2d0d8bd8457e3 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -19,8 +19,8 @@ nixl==0.3.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.9.0.dev20250716 -torchvision==0.24.0.dev20250716 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" +torch==2.9.0.dev20250724 +torchvision==0.24.0.dev20250724 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" From 7ae75fa6d02afc45637b060ce7a535a1bd547afd Mon Sep 17 00:00:00 2001 From: Alex Kogan <82225080+sakogan@users.noreply.github.com> Date: Fri, 25 Jul 2025 21:09:34 -0400 Subject: [PATCH 009/182] [Feature] Add support for MoE models in the calibration-free RTN-based quantization (#20766) Signed-off-by: Alex Kogan --- tests/quantization/test_rtn.py | 5 +- .../model_executor/layers/quantization/rtn.py | 234 +++++++++++++++--- 2 files changed, 201 insertions(+), 38 deletions(-) diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py index 133b2d9e4df69..bc2b468f97d8c 100644 --- a/tests/quantization/test_rtn.py +++ b/tests/quantization/test_rtn.py @@ -8,7 +8,10 @@ import pytest from tests.quantization.utils import is_quant_method_supported -MODELS = ["microsoft/Phi-3-mini-4k-instruct"] +MODELS = [ + "microsoft/Phi-3-mini-4k-instruct", # dense model + "ai21labs/Jamba-tiny-dev", # MoE model +] @pytest.mark.skipif(not is_quant_method_supported("rtn"), diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 68309716cf901..cceaf9857c40f 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -3,18 +3,19 @@ # Copyright © 2025, Oracle and/or its affiliates. 
import os -from typing import Any, Optional +from typing import Any, Callable, Optional import torch import torch.nn.functional as F from torch.nn.parameter import Parameter from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) logger = init_logger(__name__) """By default, use 8 bit as target precision, but it can be @@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig): return cls(weight_bits, group_size) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["RTNLinearMethod"]: + prefix: str) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): return RTNLinearMethod(self) + elif isinstance(layer, FusedMoE): + return RTNMoEMethod(self) return None @@ -94,11 +97,18 @@ class RTNTensor: self.data.narrow(dim, start // factor, length // factor), self.scale.narrow(dim, start, length), self.quant_config) + def __getitem__(self, key): + return RTNTensor(self.data[key], self.scale[key], self.quant_config) + @property def shape(self): shape = self.data.shape factor = 1 if self.quant_config.weight_bits == 8 else 2 - return torch.Size((shape[0] * factor, shape[1])) + batch_present = len(shape) == 3 + if batch_present: + return torch.Size((shape[0], shape[1] * factor, shape[2])) + else: + return torch.Size((shape[0] * factor, shape[1])) def copy_(self, loaded_weight: torch.Tensor) -> None: qweight, weight_scale = rtn_quantize(loaded_weight.cuda(), @@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase): weight = RTNParameter(data=torch.empty(output_size_per_partition // factor, input_size_per_partition, - dtype=torch.int8), + dtype=torch.uint8), scale=scale, 
quant_config=self.quant_config) @@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase): layer.output_size_per_partition = output_size_per_partition def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - """torch.compile does not know how to deal with a Parameter subclass - (aka RTNParameter). As we don't really need RTNParameters for the - forward pass, we replace them with equivalent instances of Parameters. - """ - old_weight = layer.weight - assert isinstance(old_weight, RTNParameter) - data = old_weight.data.data - - delattr(layer, "weight") - - new_weight = Parameter(data=data, requires_grad=False) - layer.register_parameter("weight", new_weight) + fix_weights(layer, "weight") def apply(self, layer: torch.nn.Module, @@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase): return out +class RTNMoEMethod(FusedMoEMethodBase): + + def __init__(self, quant_config: RTNConfig): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + factor = 1 if self.quant_config.weight_bits == 8 else 2 + + # Fused gate_up_proj (column parallel) + num_groups_per_col = (hidden_size // self.quant_config.group_size + if self.quant_config.group_size != -1 else 1) + w13_scale = Parameter( + torch.empty(num_experts, + 2 * intermediate_size_per_partition, + num_groups_per_col, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_scale", w13_scale) + + w13_weight = RTNParameter(data=torch.empty( + num_experts, + 2 * intermediate_size_per_partition // factor, + hidden_size, + dtype=torch.uint8), + scale=w13_scale, + quant_config=self.quant_config) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + num_groups_per_col = (intermediate_size_per_partition // + self.quant_config.group_size + 
if self.quant_config.group_size != -1 else 1) + w2_scale = Parameter(torch.zeros(num_experts, + hidden_size, + num_groups_per_col, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_scale", w2_scale) + + w2_weight = RTNParameter(data=torch.empty( + num_experts, + hidden_size // factor, + intermediate_size_per_partition, + dtype=torch.uint8), + scale=w2_scale, + quant_config=self.quant_config) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight_bits = self.quant_config.weight_bits + fix_weights(layer, "w13_weight", weight_bits == 4) + fix_weights(layer, "w2_weight", weight_bits == 4) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `RTNMoEMethod` yet.") + + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + 
e_score_correction_bias=e_score_correction_bias) + + weight_bits = self.quant_config.weight_bits + group_size = self.quant_config.group_size + + ret = fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + global_num_experts=global_num_experts, + w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + block_shape=[0, group_size]) + + return ret + + def rtn_quantize(tensor: torch.Tensor, num_bits: int, group_size: int) -> tuple[torch.Tensor, torch.Tensor]: """Quantize a tensor using per-group static scaling factor. @@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int, If equal to -1, each row in the input tensor is treated as one group. """ + batch_present = len(tensor.shape) == 3 + if not batch_present: + tensor = tensor.unsqueeze(0) q_range = 2**num_bits - num_groups = (tensor.shape[0] * tensor.shape[1] // - group_size if group_size != -1 else tensor.shape[0]) + num_groups = (tensor.shape[1] * tensor.shape[2] // + group_size if group_size != -1 else tensor.shape[1]) """Calculate a scaling factor per input group. """ - input_flat = tensor.reshape(num_groups, -1) - input_min = torch.min(input_flat, dim=1, keepdim=True)[0] - input_max = torch.max(input_flat, dim=1, keepdim=True)[0] + input_flat = tensor.reshape(tensor.shape[0], num_groups, -1) + input_min = torch.min(input_flat, dim=2, keepdim=True)[0] + input_max = torch.max(input_flat, dim=2, keepdim=True)[0] input_max_abs = torch.max(input_min.abs(), input_max.abs()) scale = (input_max_abs * 2.0 / (q_range - 1)) - """Scale each input group, truncate and round to the nearest integer. + """Scale each input group, round to the nearest integer, shift + the range and truncate. 
""" scaled_input = input_flat / scale - scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1) scaled_input = scaled_input.round() + scaled_input += q_range // 2 + scaled_input = scaled_input.clamp(0, q_range - 1) - scale = scale.reshape(tensor.shape[0], -1).contiguous() - inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8) + scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous() + inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8) inputs_q = inputs_q.contiguous() if num_bits == 4: """Pack two 4-bit values into each byte. """ - inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf) - inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1]) + inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf) + inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2, + tensor.shape[2]) inputs_q = inputs_q.contiguous() + if not batch_present: + inputs_q = inputs_q.squeeze(0) + scale = scale.squeeze(0) + return inputs_q, scale @@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: tensor: The input tensor. scale: The tensor with per-group scale factors. """ + batch_present = len(tensor.shape) == 3 + if not batch_present: + tensor = tensor.unsqueeze(0) + scale = scale.unsqueeze(0) - num_groups = scale.size(0) * scale.size(1) - input_dim, output_dim = tensor.shape + num_groups = scale.size(1) * scale.size(2) + batch, input_dim, output_dim = tensor.shape - num_bits = 8 if input_dim == scale.size(0) else 4 + num_bits = 8 if input_dim == scale.size(1) else 4 + q_range = 2**num_bits if num_bits == 4: input_dim *= 2 - data = torch.empty((input_dim, output_dim), + data = torch.empty((batch, input_dim, output_dim), dtype=scale.dtype, device=tensor.device) if num_bits == 8: data.copy_(tensor) + data -= q_range // 2 else: """Unpack two 4-bit values from each byte. 
""" - tensor = tensor.reshape(input_dim, output_dim // 2) + tensor = tensor.reshape(batch, input_dim, output_dim // 2) for i in range(2): - data[:, i::2] = (tensor << 4 * (1 - i)) >> 4 + data[:, :, i::2] = ((tensor << 4 * + (1 - i)) >> 4).to(torch.int8) - q_range // 2 """Scale each input group with its scaling factor. """ - scale = scale.reshape(num_groups, -1) - data = data.reshape(num_groups, -1) + scale = scale.reshape(batch, num_groups, -1) + data = data.reshape(batch, num_groups, -1) data = torch.mul(data, scale) - input_deq = data.reshape((input_dim, output_dim)).contiguous() + input_deq = data.reshape((batch, input_dim, output_dim)).contiguous() + if not batch_present: + input_deq = input_deq.squeeze(0) + return input_deq + + +def fix_weights(layer: torch.nn.Module, + param_name: str, + reshape: bool = False): + """torch.compile does not know how to deal with a Parameter subclass + (aka RTNParameter). As we don't really need RTNParameters for the + forward pass, we replace them with equivalent instances of Parameters. 
+ """ + old_weight = getattr(layer, param_name) + assert isinstance(old_weight, RTNParameter) + data = old_weight.data.data + + delattr(layer, param_name) + + if reshape: + data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1) + new_weight = Parameter(data=data, requires_grad=False) + layer.register_parameter(param_name, new_weight) From 62965de5fe8be8e3622952a9b5cda86973cf9c51 Mon Sep 17 00:00:00 2001 From: Farzad Abdolhosseini Date: Sat, 26 Jul 2025 04:12:31 +0300 Subject: [PATCH 010/182] [Model] Ultravox: Support Llama 4 and Gemma 3 backends (#17818) Signed-off-by: Farzad Abdolhosseini Signed-off-by: Patrick Li Co-authored-by: Patrick Li --- tests/models/registry.py | 2 ++ vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/ultravox.py | 38 +++++++++++++-------- vllm/transformers_utils/configs/ultravox.py | 22 +++++++----- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 1800262ced67f..b41e432d738a7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -221,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501 "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", is_available_online=False), + "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 + is_available_online=False), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 14a8ac7876f73..9b204fdcbe1a5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -89,6 +89,7 @@ _TEXT_GENERATION_MODELS = { "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), 
"JambaForCausalLM": ("jamba", "JambaForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), + "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"), # noqa: E501 # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 3697e3fd0cf43..a4569ccd5a845 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -39,9 +39,7 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, merge_multimodal_embeddings, merge_multimodal_embeddings_from_map) -_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>" -_AUDIO_PLACEHOLDER_TOKEN = 128002 -_AUDIO_TOKENS_PER_SECOND = 6.25 +_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 @@ -80,14 +78,15 @@ class UltravoxProcessingInfo(BaseProcessingInfo): sampling_rate: Optional[int] = None, **kwargs: object, ) -> ProcessorMixin: + config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) # NOTE: Ultravox processing definition uses '<|eot_id|>' as the # placeholder that will cause confusion with the actual end of turn - # token, thus we override placeholder with a reserved special - # token. + # token, thus we override placeholder with a reserved token. 
hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE - hf_processor.audio_replacement_token_id = _AUDIO_PLACEHOLDER_TOKEN + hf_processor.audio_replacement_token_id = config.audio_token_index + return hf_processor def get_feature_extractor( @@ -274,7 +273,7 @@ class UltravoxProjector(nn.Module): else: self.act = get_act_fn(config.projector_act) - dim_out = config.text_config.hidden_size + dim_out = config.text_hidden_size self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False) # Ultravox v0.4.1 and below use layer_norm after the second linear layer @@ -572,9 +571,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: + # The audio token index is not included in the embedding table + # We need to remove it before embedding lookup + safe_input_ids = input_ids.clone() + safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0 + inputs_embeds = self.language_model.get_input_embeddings( + safe_input_ids) + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: # TODO(ywang96): remove this block after v0 is deprecated. 
if not envs.VLLM_USE_V1: @@ -585,7 +589,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): else: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - _AUDIO_PLACEHOLDER_TOKEN) + self.config.audio_token_index) return inputs_embeds def forward(self, @@ -623,10 +627,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): multimodal_embeddings) input_ids = None - hidden_states = self.language_model.model(input_ids, - positions, - intermediate_tensors, - inputs_embeds=inputs_embeds) + language_model = self.language_model + if hasattr(language_model, "language_model"): + language_model = language_model.language_model + + hidden_states = language_model.model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 62f63b02d49a4..87064cc12deda 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -45,6 +45,7 @@ class UltravoxConfig(transformers.PretrainedConfig): """ model_type = "ultravox" + audio_token = "<|audio|>" is_composition = False def __init__( @@ -80,29 +81,32 @@ class UltravoxConfig(transformers.PretrainedConfig): # Avoid circular import from vllm.transformers_utils.config import get_config - self.text_config = get_config(text_model_id, - trust_remote_code=False) + text_config_obj = get_config(text_model_id, + trust_remote_code=False) else: text_config = text_config or {} - self.text_config = transformers.CONFIG_MAPPING[text_config.get( + text_config_obj = transformers.CONFIG_MAPPING[text_config.get( "model_type", "llama")](**text_config) + inner_text_config = text_config_obj.get_text_config() + if audio_model_id is not None: # Avoid circular import from vllm.transformers_utils.config import get_config 
- self.audio_config = get_config(audio_model_id, - trust_remote_code=False) + audio_config = get_config(audio_model_id, trust_remote_code=False) else: audio_config = audio_config or {} - self.audio_config = transformers.CONFIG_MAPPING[audio_config.get( + audio_config = transformers.CONFIG_MAPPING[audio_config.get( "model_type", "whisper")](**audio_config) + self.text_config = text_config_obj + self.audio_config = audio_config self.text_model_lora_config = text_model_lora_config or {} self.audio_model_lora_config = audio_model_lora_config or {} - self.vocab_size = self.text_config.vocab_size - - self.initializer_range = self.text_config.initializer_range + self.vocab_size = inner_text_config.vocab_size + self.initializer_range = inner_text_config.initializer_range + self.text_hidden_size = inner_text_config.hidden_size super().__init__(**kwargs) From 97349fe2bc68de69550787135c1a8c6b85fc8d81 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Sat, 26 Jul 2025 09:37:32 +0800 Subject: [PATCH 011/182] [Docs] add offline serving multi-modal video input expamle Qwen2.5-VL (#21530) Signed-off-by: David Chen <530634352@qq.com> --- docs/features/multimodal_inputs.md | 64 ++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e820ace4f8fe7..e83dfdb11dadc 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. +Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown in this example using Qwen2.5-VL: + +??? 
code + + ```python + from transformers import AutoProcessor + from vllm import LLM, SamplingParams + from qwen_vl_utils import process_vision_info + + model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + video_path = "https://content.pexels.com/videos/free-videos.mp4" + + llm = LLM( + model=model_path, + gpu_memory_utilization=0.8, + enforce_eager=True, + limit_mm_per_prompt={"video": 1}, + ) + + sampling_params = SamplingParams( + max_tokens=1024, + ) + + video_messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "describe this video."}, + { + "type": "video", + "video": video_path, + "total_pixels": 20480 * 28 * 28, + "min_pixels": 16 * 28 * 28 + } + ] + }, + ] + + messages = video_messages + processor = AutoProcessor.from_pretrained(model_path) + prompt = processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + image_inputs, video_inputs = process_vision_info(messages) + mm_data = {} + if video_inputs is not None: + mm_data["video"] = video_inputs + + llm_inputs = { + "prompt": prompt, + "multi_modal_data": mm_data, + } + + outputs = llm.generate([llm_inputs], sampling_params=sampling_params) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` + + !!! note + 'process_vision_info' is only applicable to Qwen2.5-VL and similar models. 
+ Full example: ### Audio Inputs From a55c95096b3537edfbbb7a5eafae0b0475c5ef07 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 25 Jul 2025 19:06:21 -0700 Subject: [PATCH 012/182] Correctly kill vLLM processes after finishing serving benchmarks (#21641) Signed-off-by: Huy Do --- .../scripts/run-nightly-benchmarks.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc47..4162905bb3cc3 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -95,12 +95,14 @@ json2args() { } kill_gpu_processes() { - pkill -f python - pkill -f python3 - pkill -f tritonserver - pkill -f pt_main_thread - pkill -f text-generation - pkill -f lmdeploy + pkill -f '[p]ython' + pkill -f '[p]ython3' + pkill -f '[t]ritonserver' + pkill -f '[p]t_main_thread' + pkill -f '[t]ext-generation' + pkill -f '[l]mdeploy' + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pkill -f '[V]LLM' while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 From 2f6e6b33fb0acf5331982c8e1ea620005e3a19ba Mon Sep 17 00:00:00 2001 From: Alexandre JUAN Date: Sat, 26 Jul 2025 05:11:10 +0200 Subject: [PATCH 013/182] [Bugfix] Fix isinstance check for tensor types in _load_prompt_embeds to use dtype comparison (#21612) Signed-off-by: Alexandre Juan --- vllm/entrypoints/openai/serving_engine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9d848679d5d98..71976fea1ee77 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -957,9 +957,11 @@ class OpenAIServing: def 
_load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor = torch.load(io.BytesIO(base64.b64decode(embed)), weights_only=True) - assert isinstance( - tensor, - (torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor)) + assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( + torch.float32, + torch.bfloat16, + torch.float16, + ) if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 7728dd77bb802e1876012eb264df4d2fa2fc6f3c Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 25 Jul 2025 23:20:30 -0700 Subject: [PATCH 014/182] [TPU][Test] Divide TPU v1 Test into 2 parts. (#21431) --- .../hardware_ci/run-tpu-v1-test-part2.sh | 166 ++++++++++++++++++ .../scripts/hardware_ci/run-tpu-v1-test.sh | 12 -- 2 files changed, 166 insertions(+), 12 deletions(-) create mode 100755 .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh new file mode 100755 index 0000000000000..d998c1f73b514 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." 
+ exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. 
+echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." 
+ return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. + overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 1 "test_struct_output_generate.py" \ + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 2 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 3 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 4 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 5 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 6 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. 
+ +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." + exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 5514d7770cff8..e565d4b246945 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" -run_and_track_test 11 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" -run_and_track_test 12 "test_moe_pallas.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" -run_and_track_test 13 "test_lora.py" \ - "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" -run_and_track_test 14 "test_tpu_qkv_linear.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" -run_and_track_test 15 "test_spmd_model_weight_loading.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" -run_and_track_test 16 "test_kv_cache_update_kernel.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" # After all tests have 
been attempted, exit with the overall status. if [ "$overall_script_exit_code" -ne 0 ]; then From 875af38e01217f20827f0b4e1353b91c884b9d53 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Sat, 26 Jul 2025 19:14:04 +0800 Subject: [PATCH 015/182] Support Intern-S1 (#21628) Signed-off-by: Roger Wang Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py Co-authored-by: Your Name Co-authored-by: Roger Wang Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 32 + .../vision_language_multi_image.py | 28 + tests/models/registry.py | 2 + vllm/model_executor/models/interns1.py | 711 ++++++++++++++++++ vllm/model_executor/models/interns1_vit.py | 421 +++++++++++ vllm/model_executor/models/registry.py | 1 + 7 files changed, 1196 insertions(+) create mode 100644 vllm/model_executor/models/interns1.py create mode 100644 vllm/model_executor/models/interns1_vit.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0f3b730eabedc..3847fc15119fd 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -593,6 +593,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index eb6b410848558..61f5525c6d7e7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -468,6 +468,37 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: ) +# Intern-S1 +def run_interns1(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "internlm/Intern-S1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + placeholder = "" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: model_name = "OpenGVLab/InternVL3-2B" @@ -1303,6 +1334,7 @@ model_example_map = { "h2ovl_chat": run_h2ovl, "hyperclovax_seed_vision": run_hyperclovax_seed_vision, "idefics3": run_idefics3, + "interns1": run_interns1, "internvl_chat": run_internvl, "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 2e14fc807e104..e312a0953e9be 100644 --- 
a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "internlm/Intern-S1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -946,6 +973,7 @@ model_example_map = { "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, + "interns1": load_interns1, "internvl_chat": load_internvl, "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, diff --git a/tests/models/registry.py b/tests/models/registry.py index b41e432d738a7..0dc5aec8db12e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -381,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": 
_HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py new file mode 100644 index 0000000000000..36204e4c5953f --- /dev/null +++ b/vllm/model_executor/models/interns1.py @@ -0,0 +1,711 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# -------------------------------------------------------- +# InternS1 +# Copyright (c) 2025 Shanghai AI Lab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, TypedDict, Union + +import torch +import torch.nn as nn +from transformers import InternVLProcessor, PretrainedConfig +from transformers.activations import ACT2FN +from transformers.models.got_ocr2.image_processing_got_ocr2_fast import ( + GotOcr2ImageProcessorFast) + +from vllm.config import VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.interns1_vit import InternS1VisionModel +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, WeightsMapper, 
flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + + +class InternS1MultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * + int(1 / config.downsample_ratio)**2) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * + int(1 / config.downsample_ratio)**2, + config.text_config.hidden_size) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear(config.text_config.hidden_size, + config.text_config.hidden_size) + + def forward(self, image_features): + hidden_states = self.layer_norm(image_features) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class InternS1ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """ + Shape: + `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + """ + + +class InternS1ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_images, total_image_feature_size, hidden_size)` + or a list of tensors of shape `(total_image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +InternS1ImageInputs = Union[InternS1ImagePixelInputs, + InternS1ImageEmbeddingInputs] + + +class InternS1VideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values: torch.Tensor + """ + Shape: + `(batch_size * num_video * num_frames, num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + +class InternS1VideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` + or a list of tensors of shape `(total_video_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + +InternS1VideoInputs = Union[InternS1VideoPixelInputs, + InternS1VideoEmbeddingInputs] + + +def resolve_interns1_min_max_num( + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch + + +def get_interns1_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +class InternS1ProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternS1-style models.""" + + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + return self.ctx.get_hf_processor(InternVLProcessor, **kwargs) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: 
int, + processor: Optional['GotOcr2ImageProcessorFast'] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor().image_processor + + if not isinstance(processor, GotOcr2ImageProcessorFast): + raise ValueError(f'GotOcr2ImageProcessorFast is expected but got ' + f'{type(processor)}') + num_image_patches = processor.get_number_of_image_tokens( + image_height, image_width, images_kwargs=dict()) + num_image_tokens = self.get_hf_processor( + ).image_seq_length * num_image_patches + return num_image_tokens + + def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None): + image_processor = self.get_hf_processor().image_processor + min_dynamic_patch = image_processor.min_patches + max_dynamic_patch = image_processor.max_patches + # HF format's InternVL processor uses `crop_to_patches` which is + # equivalent to `use_thumbnail` in original format. + use_thumbnail = image_processor.crop_to_patches + dynamic_image_size = True + min_num, max_num = resolve_interns1_min_max_num( + min_dynamic_patch, + max_dynamic_patch, + dynamic_image_size, + use_thumbnail=use_thumbnail) + + return get_interns1_target_ratios(min_num, max_num) + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + hf_config = self.ctx.get_hf_config() + base_height, base_width = hf_config.vision_config.image_size + target_ratios = self.resolve_target_ratios() + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_width * wr, base_height * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor.image_processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + assert not (largest_feature_size == 0 or largest_feature_pinpoint + is None), ("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + 
+ def get_max_image_tokens(self) -> int: + processor = self.get_hf_processor() + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=processor.image_processor, + ) + + +class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] + ): + """Basic image-only DummyInputsBuilder for InternS1-style models.""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = self.info.get_hf_processor().image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + +class InternS1MultiModalProcessor( + BaseMultiModalProcessor[InternS1ProcessingInfo]): + """ Basic image-only MultiModalProcessor for InternS1-style models.""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + image_token_id = hf_processor.image_token_id + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = torch.tensor(image_token_id) + images = mm_data.get('images', None) + image_processor = self.info.get_hf_processor().image_processor + if images is not None: + 
image_inputs = image_processor(images=images) + image_num_patches = image_inputs.pop("num_patches") + if not isinstance(image_num_patches, list): + raise ValueError( + f'num_patches is supposed to be list, but got ' + f'{type(image_num_patches)}') + image_num_patches = torch.tensor(image_num_patches) + processed_outputs['image_num_patches'] = image_num_patches + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + img_context_token = hf_processor.image_token + start_image_token = hf_processor.start_image_token + end_image_token = hf_processor.end_image_token + + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor.image_processor, + ) + + repl_features = img_context_token * feature_size + repl_full = start_image_token + repl_features + end_image_token + return PromptUpdateDetails.select_text(repl_full, + 
img_context_token)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=img_context_token,
+                replacement=get_replacement,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    InternS1MultiModalProcessor,
+    info=InternS1ProcessingInfo,
+    dummy_inputs=InternS1DummyInputsBuilder)
+class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                       SupportsPP, SupportsLoRA):
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+        })
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        # transformers InternVLProcessor uses as the separator
+        # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
+        if modality.startswith("image"):
+            return ''
+        if modality.startswith("video"):
+            return "