From 7311f74468d2ba4f89658aa0fedf3811f8769b30 Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 25 Jul 2025 18:42:23 +0800 Subject: [PATCH 01/57] [Bugfix] GGUF: fix AttributeError: 'PosixPath' object has no attribute 'startswith' (#21579) Signed-off-by: Kebe --- vllm/transformers_utils/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8d1f59e6eadf1..da475c3b50a39 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -584,7 +584,7 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]: @cache -def get_sentence_transformer_tokenizer_config(model: str, +def get_sentence_transformer_tokenizer_config(model: Union[str, Path], revision: Optional[str] = 'main' ): """ @@ -592,7 +592,7 @@ def get_sentence_transformer_tokenizer_config(model: str, given Sentence Transformer BERT model. Parameters: - - model (str): The name of the Sentence Transformer + - model (str|Path): The name of the Sentence Transformer BERT model. - revision (str, optional): The revision of the m odel to use. Defaults to 'main'. 
@@ -620,7 +620,7 @@ def get_sentence_transformer_tokenizer_config(model: str, if encoder_dict: break - if not encoder_dict and not model.startswith("/"): + if not encoder_dict and not Path(model).is_absolute(): try: # If model is on HuggingfaceHub, get the repo files repo_files = list_repo_files(model, From 5c3f2628d5ddad55fd350ef733167fd7050e4ac6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 25 Jul 2025 18:57:34 +0800 Subject: [PATCH 02/57] [Quantization] Enable BNB support for more MoE models (#21370) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/dots1.py | 148 +++++++++++++------------ vllm/model_executor/models/glm4_moe.py | 25 +++-- 2 files changed, 93 insertions(+), 80 deletions(-) diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 4bdcbfabbbc26..9b21a79446138 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -54,8 +54,8 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP -from .utils import (PPMissingLayer, is_pp_missing_parameter, +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -327,6 +327,7 @@ class Dots1DecoderLayer(nn.Module): return hidden_states, residual +@support_torch_compile class Dots1Model(nn.Module): fall_back_to_pt_during_load = False @@ -404,68 +405,12 @@ class Dots1Model(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - -@support_torch_compile -class Dots1ForCausalLM(nn.Module, SupportsPP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - 
self.quant_config = quant_config - self.model = Dots1Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model( - input_ids, - positions, - intermediate_tensors, - inputs_embeds, - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -477,14 +422,9 @@ class Dots1ForCausalLM(nn.Module, SupportsPP): ("gate_up_proj", "up_proj", 1), ] - 
expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts) - params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -534,3 +474,71 @@ class Dots1ForCausalLM(nn.Module, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Dots1Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids, + positions, + intermediate_tensors, + inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + 
sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 43824abb571a6..6a196fef572de 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -53,7 +53,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -461,6 +461,15 @@ class Glm4MoeModel(nn.Module): device=device), }) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ @@ -472,16 +481,9 @@ class Glm4MoeModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - 
ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts) - params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) if spec_layer is not None: @@ -570,7 +572,7 @@ class Glm4MoeModel(nn.Module): return loaded_params -class Glm4MoeForCausalLM(nn.Module, SupportsPP): +class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -677,6 +679,9 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP): loader = AutoWeightsLoader(self) return loader.load_weights(weights) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, weight_name: str) -> Optional[int]: From 46d81d69511ac11f27d19af80d19dd7b2cce8613 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 25 Jul 2025 20:36:45 +0800 Subject: [PATCH 03/57] [V1] Get supported tasks from model runner instead of model config (#21585) Signed-off-by: DarkLight1337 --- vllm/entrypoints/llm.py | 24 +++++++++++----- vllm/entrypoints/openai/api_server.py | 32 +++++++++++++--------- vllm/entrypoints/openai/run_batch.py | 21 +++++++++----- vllm/executor/executor_base.py | 8 +++--- vllm/model_executor/layers/pooler.py | 3 +- vllm/model_executor/models/bert.py | 2 +- vllm/model_executor/models/gritlm.py | 2 +- vllm/model_executor/models/modernbert.py | 2 +- vllm/pooling_params.py | 5 ++-- vllm/tasks.py | 11 ++++++++ vllm/v1/engine/async_llm.py | 4 +++ vllm/v1/engine/core.py | 11 ++++++-- vllm/v1/engine/core_client.py | 16 +++++++++++ vllm/v1/engine/llm_engine.py | 4 +++ vllm/v1/worker/gpu_model_runner.py | 35 +++++++++++++++++++++--- vllm/v1/worker/gpu_worker.py | 6 ++-- vllm/v1/worker/tpu_model_runner.py | 31 +++++++++++++++++++-- vllm/v1/worker/tpu_worker.py | 6 ++-- 
vllm/worker/model_runner_base.py | 31 +++++++++++++++++++-- 19 files changed, 200 insertions(+), 54 deletions(-) create mode 100644 vllm/tasks.py diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2f766a2dae57a..2c961156bc845 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -14,6 +14,7 @@ from pydantic import ValidationError from tqdm.auto import tqdm from typing_extensions import TypeVar, deprecated +import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) @@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput, PoolingRequestOutput, RequestOutput, ScoringRequestOutput) -from vllm.pooling_params import PoolingParams, PoolingTask +from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, RequestOutputKind, SamplingParams) +from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext @@ -277,6 +279,16 @@ class LLM: self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None + if envs.VLLM_USE_V1: + supported_tasks = self.llm_engine \ + .get_supported_tasks() # type: ignore + else: + supported_tasks = self.llm_engine.model_config.supported_tasks + + logger.info("Supported_tasks: %s", supported_tasks) + + self.supported_tasks = supported_tasks + def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, @@ -1170,8 +1182,7 @@ class LLM: A list of `EmbeddingRequestOutput` objects containing the embedding vectors in the same order as the input prompts. 
""" - model_config = self.llm_engine.model_config - if "embed" not in model_config.supported_tasks: + if "embed" not in self.supported_tasks: raise ValueError("Embedding API is not supported by this model. " "Please set `--task embed`.") @@ -1215,8 +1226,7 @@ class LLM: A list of `ClassificationRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ - model_config = self.llm_engine.model_config - if "classify" not in model_config.supported_tasks: + if "classify" not in self.supported_tasks: raise ValueError( "Classification API is not supported by this model. " "Please set `--task classify`.") @@ -1397,8 +1407,8 @@ class LLM: raise ValueError(" ".join(messages)) - if all(t not in model_config.supported_tasks - for t in ("embed", "classify")): + supported_tasks = self.supported_tasks + if all(t not in supported_tasks for t in ("embed", "classify")): raise ValueError("Score API is not supported by this model. " "Please set `--task embed` or `--task classify`.") diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8540d25d4e94d..5b87aed06e9ba 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1586,6 +1586,14 @@ async def init_app_state( state.vllm_config = vllm_config model_config = vllm_config.model_config + if envs.VLLM_USE_V1: + supported_tasks = await engine_client \ + .get_supported_tasks() # type: ignore + else: + supported_tasks = model_config.supported_tasks + + logger.info("Supported_tasks: %s", supported_tasks) + resolved_chat_template = load_chat_template(args.chat_template) if resolved_chat_template is not None: # Get the tokenizer to check official template @@ -1647,7 +1655,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, - ) if "generate" in model_config.supported_tasks else 
None + ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -1664,7 +1672,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, - ) if "generate" in model_config.supported_tasks else None + ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -1673,7 +1681,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, - ) if "generate" in model_config.supported_tasks else None + ) if "generate" in supported_tasks else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, model_config, @@ -1681,7 +1689,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if "encode" in model_config.supported_tasks else None + ) if "encode" in supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, @@ -1689,24 +1697,22 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if "embed" in model_config.supported_tasks else None + ) if "embed" in supported_tasks else None state.openai_serving_classification = ServingClassification( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, - ) if "classify" in model_config.supported_tasks else None + ) if "classify" in supported_tasks else None - enable_serving_reranking = ("classify" in model_config.supported_tasks - and getattr(model_config.hf_config, - "num_labels", 0) == 
1) + enable_serving_reranking = ("classify" in supported_tasks and getattr( + model_config.hf_config, "num_labels", 0) == 1) state.openai_serving_scores = ServingScores( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, - ) if ("embed" in model_config.supported_tasks - or enable_serving_reranking) else None + ) if ("embed" in supported_tasks or enable_serving_reranking) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, @@ -1721,13 +1727,13 @@ async def init_app_state( model_config, state.openai_serving_models, request_logger=request_logger, - ) if "transcription" in model_config.supported_tasks else None + ) if "transcription" in supported_tasks else None state.openai_serving_translation = OpenAIServingTranslation( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, - ) if "transcription" in model_config.supported_tasks else None + ) if "transcription" in supported_tasks else None state.task = model_config.task state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 5770550923270..137b368dad202 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -14,6 +14,7 @@ import torch from prometheus_client import start_http_server from tqdm import tqdm +import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.protocol import EngineClient @@ -335,6 +336,14 @@ async def run_batch( model_config = vllm_config.model_config + if envs.VLLM_USE_V1: + supported_tasks = await engine_client \ + .get_supported_tasks() # type: ignore + else: + supported_tasks = model_config.supported_tasks + + logger.info("Supported_tasks: %s", supported_tasks) + # Create the openai serving objects. 
openai_serving_models = OpenAIServingModels( engine_client=engine_client, @@ -351,7 +360,7 @@ async def run_batch( chat_template=None, chat_template_content_format="auto", enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if "generate" in model_config.supported_tasks else None + ) if "generate" in supported_tasks else None openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, @@ -359,19 +368,17 @@ async def run_batch( request_logger=request_logger, chat_template=None, chat_template_content_format="auto", - ) if "embed" in model_config.supported_tasks else None + ) if "embed" in supported_tasks else None - enable_serving_reranking = ("classify" in model_config.supported_tasks - and getattr(model_config.hf_config, - "num_labels", 0) == 1) + enable_serving_reranking = ("classify" in supported_tasks and getattr( + model_config.hf_config, "num_labels", 0) == 1) openai_serving_scores = ServingScores( engine_client, model_config, openai_serving_models, request_logger=request_logger, - ) if ("embed" in model_config.supported_tasks - or enable_serving_reranking) else None + ) if ("embed" in supported_tasks or enable_serving_reranking) else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 483fdb1486f79..97d0d6f08b81e 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -16,8 +16,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.pooling_params import PoolingTask from vllm.sequence import ExecuteModelRequest, PoolerOutput +from vllm.tasks import SupportedTask from vllm.utils import make_async from vllm.worker.worker_base import WorkerBase @@ -136,9 +136,9 @@ class ExecutorBase(ABC): return self.collective_rpc(rpc_func) @cached_property # 
Avoid unnecessary RPC calls - def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]: - output = self.collective_rpc("get_supported_pooling_tasks") - return tuple({task for tasks in output for task in tasks}) + def supported_tasks(self) -> tuple[SupportedTask, ...]: + output = self.collective_rpc("get_supported_tasks") + return output[0] def execute_model( self, execute_model_req: ExecuteModelRequest diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index c06cca080227e..5bfd4aaccc17c 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.pooling_metadata import ( # noqa: E501 PoolingMetadata as V0PoolingMetadata) from vllm.model_executor.pooling_metadata import PoolingTensors -from vllm.pooling_params import PoolingParams, PoolingTask +from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput +from vllm.tasks import PoolingTask from vllm.utils import resolve_obj_by_qualname from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 9dc6115f850ec..c3066aaa2b87d 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -26,8 +26,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors +from vllm.tasks import PoolingTask from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/gritlm.py 
b/vllm/model_executor/models/gritlm.py index 8a3fbc6a49f04..c99970284a953 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -16,8 +16,8 @@ from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, get_prompt_token_ids) from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.pooling_params import PoolingTask from vllm.sequence import PoolerOutput +from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .interfaces import SupportsV0Only diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index be1c3438d9db1..fc2b0c1f51821 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -23,8 +23,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors +from vllm.tasks import PoolingTask from .interfaces import SupportsCrossEncoding, SupportsV0Only from .utils import WeightsMapper, maybe_prefix diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 868facbe2557a..23eb775f2dc69 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,17 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Literal, Optional +from typing import TYPE_CHECKING, Optional import msgspec from vllm.sampling_params import RequestOutputKind +from vllm.tasks import PoolingTask if TYPE_CHECKING: from vllm.config import ModelConfig -PoolingTask = Literal["encode", "embed", "classify", "score"] - class PoolingParams( 
msgspec.Struct, diff --git a/vllm/tasks.py b/vllm/tasks.py new file mode 100644 index 0000000000000..85c5c6e436205 --- /dev/null +++ b/vllm/tasks.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Literal, get_args + +GenerationTask = Literal["generate", "transcription"] +GENERATION_TASKS = get_args(GenerationTask) + +PoolingTask = Literal["encode", "embed", "classify", "score"] +POOLING_TASKS = get_args(PoolingTask) + +SupportedTask = Literal[GenerationTask, PoolingTask] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 02cb80197fa47..ed0d9620f4762 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -21,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams +from vllm.tasks import SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -211,6 +212,9 @@ class AsyncLLM(EngineClient): if handler := getattr(self, "output_handler", None): handler.cancel() + async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return await self.engine_core.get_supported_tasks_async() + async def add_request( self, request_id: str, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 88c511606d7c5..4124ee05326ce 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,6 +23,7 @@ from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest +from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) 
from vllm.utils import (bind_process_name, make_zmq_socket, @@ -195,11 +196,17 @@ class EngineCore: "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.model_executor.supported_tasks + def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" if pooling_params := request.pooling_params: - supported_pooling_tasks = ( - self.model_executor.supported_pooling_tasks) + supported_pooling_tasks = [ + task for task in self.get_supported_tasks() + if task in POOLING_TASKS + ] + if pooling_params.task not in supported_pooling_tasks: raise ValueError(f"Unsupported task: {pooling_params.task!r} " f"Supported tasks: {supported_pooling_tasks}") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 69ae3690d00e9..b14d85bbf8e9d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -21,6 +21,7 @@ import zmq.asyncio from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.tasks import SupportedTask from vllm.utils import get_open_port, get_open_zmq_inproc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, @@ -104,6 +105,9 @@ class EngineCoreClient(ABC): def get_output(self) -> EngineCoreOutputs: raise NotImplementedError + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + raise NotImplementedError + def add_request(self, request: EngineCoreRequest) -> None: raise NotImplementedError @@ -170,6 +174,9 @@ class EngineCoreClient(ABC): async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError + async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]: + raise NotImplementedError + async def add_request_async(self, request: EngineCoreRequest) -> None: raise NotImplementedError @@ -238,6 +245,9 @@ class 
InprocClient(EngineCoreClient): outputs, _ = self.engine_core.step() return outputs.get(0) or EngineCoreOutputs() + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.engine_core.get_supported_tasks() + def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) @@ -608,6 +618,9 @@ class SyncMPClient(MPClient): return future.result() + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.call_utility("get_supported_tasks") + def add_request(self, request: EngineCoreRequest) -> None: if self.is_dp: self.engines_running = True @@ -802,6 +815,9 @@ class AsyncMPClient(MPClient): self._ensure_output_queue_task() return await future + async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]: + return await self.call_utility_async("get_supported_tasks") + async def add_request_async(self, request: EngineCoreRequest) -> None: request.client_index = self.client_index await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 991242e18278d..efbdffbc0900d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -18,6 +18,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams +from vllm.tasks import SupportedTask from vllm.transformers_utils.tokenizer_group import ( TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext @@ -176,6 +177,9 @@ class LLMEngine: def validate_outputs(cls, outputs, output_type): return outputs + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.engine_core.get_supported_tasks() + def abort_request(self, request_ids: list[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" diff --git 
a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 32004ced4aae0..5fe594db667a5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -30,15 +30,17 @@ from vllm.logger import init_logger from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader -from vllm.model_executor.models.interfaces import is_mixture_of_experts -from vllm.model_executor.models.interfaces_base import (VllmModelForPooling, - is_pooling_model) +from vllm.model_executor.models.interfaces import (is_mixture_of_experts, + supports_transcription) +from vllm.model_executor.models.interfaces_base import ( + VllmModelForPooling, is_pooling_model, is_text_generation_model) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality -from vllm.pooling_params import PoolingParams, PoolingTask +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up) @@ -1153,6 +1155,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def get_model(self) -> nn.Module: return self.model + def get_supported_generation_tasks(self) -> list[GenerationTask]: + model = self.get_model() + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(model): + supported_tasks.append("generate") + + if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + + 
supported_tasks.append("transcription") + + return supported_tasks + def get_supported_pooling_tasks(self) -> list[PoolingTask]: model = self.get_model() if not is_pooling_model(model): @@ -1160,6 +1177,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return list(model.pooler.get_supported_tasks()) + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + tasks = list[SupportedTask]() + + if self.model_config.runner_type == "generate": + tasks.extend(self.get_supported_generation_tasks()) + if self.model_config.runner_type == "pooling": + tasks.extend(self.get_supported_pooling_tasks()) + + return tuple(tasks) + def apply_grammar_bitmask( self, scheduler_output: "SchedulerOutput", diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 522946351148b..dcfb038d28c20 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -23,8 +23,8 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors +from vllm.tasks import SupportedTask from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec @@ -320,8 +320,8 @@ class Worker(WorkerBase): def get_model(self) -> nn.Module: return self.model_runner.get_model() - def get_supported_pooling_tasks(self) -> list[PoolingTask]: - return self.model_runner.get_supported_pooling_tasks() + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.model_runner.get_supported_tasks() @torch.inference_mode() def execute_model( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e8c8008458960..59cbb0150570b 100644 --- a/vllm/v1/worker/tpu_model_runner.py 
+++ b/vllm/v1/worker/tpu_model_runner.py @@ -27,13 +27,15 @@ from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.tpu import TPUModelLoader -from vllm.model_executor.models.interfaces_base import is_pooling_model +from vllm.model_executor.models.interfaces import supports_transcription +from vllm.model_executor.models.interfaces_base import ( + is_pooling_model, is_text_generation_model) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) from vllm.multimodal.utils import group_mm_inputs_by_modality -from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors +from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2) from vllm.v1.attention.backends.pallas import (TPU_STR_DTYPE_TO_TORCH_DTYPE, @@ -489,6 +491,21 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def get_model(self) -> nn.Module: return self.model + def get_supported_generation_tasks(self) -> list[GenerationTask]: + model = self.get_model() + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(model): + supported_tasks.append("generate") + + if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + + supported_tasks.append("transcription") + + return supported_tasks + def get_supported_pooling_tasks(self) -> list[PoolingTask]: model = self.get_model() if not is_pooling_model(model): @@ -496,6 +513,16 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return list(model.pooler.get_supported_tasks()) + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + tasks = list[SupportedTask]() + + if self.model_config.runner_type == "generate": + 
tasks.extend(self.get_supported_generation_tasks()) + if self.model_config.runner_type == "pooling": + tasks.extend(self.get_supported_pooling_tasks()) + + return tuple(tasks) + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 254b058d2cd32..72e0e4230a017 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -21,7 +21,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.pooling_params import PoolingTask +from vllm.tasks import SupportedTask from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv from vllm.v1.attention.backends.pallas import TPU_HEAD_SIZE_ALIGNMENT from vllm.v1.core.sched.output import SchedulerOutput @@ -282,8 +282,8 @@ class TPUWorker: def get_model(self) -> nn.Module: return self.model_runner.get_model() - def get_supported_pooling_tasks(self) -> list[PoolingTask]: - return self.model_runner.get_supported_pooling_tasks() + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return self.model_runner.get_supported_tasks() def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index feca8a7a1e74f..7b8fe2f802d68 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -12,9 +12,11 @@ import torch.nn as nn from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.interfaces_base import is_pooling_model -from vllm.pooling_params import PoolingTask +from vllm.model_executor.models.interfaces import supports_transcription +from vllm.model_executor.models.interfaces_base import ( 
+ is_pooling_model, is_text_generation_model) from vllm.sequence import IntermediateTensors, SequenceGroupMetadata +from vllm.tasks import GenerationTask, PoolingTask, SupportedTask if TYPE_CHECKING: from vllm.attention import AttentionMetadata @@ -224,6 +226,21 @@ class ModelRunnerBase(ABC, Generic[T]): def get_model(self) -> nn.Module: raise NotImplementedError + def get_supported_generation_tasks(self) -> list[GenerationTask]: + model = self.get_model() + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(model): + supported_tasks.append("generate") + + if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + + supported_tasks.append("transcription") + + return supported_tasks + def get_supported_pooling_tasks(self) -> list[PoolingTask]: model = self.get_model() if not is_pooling_model(model): @@ -231,6 +248,16 @@ class ModelRunnerBase(ABC, Generic[T]): return list(model.pooler.get_supported_tasks()) + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + tasks = list[SupportedTask]() + + if self.model_config.runner_type == "generate": + tasks.extend(self.get_supported_generation_tasks()) + if self.model_config.runner_type == "pooling": + tasks.extend(self.get_supported_pooling_tasks()) + + return tuple(tasks) + def execute_model( self, model_input: T, From f3a683b7c9df8b251092e48e53d58220bb920f2c Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 25 Jul 2025 20:53:07 +0800 Subject: [PATCH 04/57] [Bugfix][Logprobs] Fix logprobs op to support more backend (#21591) Signed-off-by: MengqingCao --- vllm/v1/sample/ops/logprobs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/sample/ops/logprobs.py b/vllm/v1/sample/ops/logprobs.py index a4d65485140ec..82875b7c84522 100644 --- a/vllm/v1/sample/ops/logprobs.py +++ b/vllm/v1/sample/ops/logprobs.py @@ -4,8 +4,10 @@ import torch +from vllm.platforms import current_platform -@torch.compile(dynamic=True) + 
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def batched_count_greater_than(x: torch.Tensor, values: torch.Tensor) -> torch.Tensor: """ From c72f049cb4c98b92cb85fc791e74d9220a263cf2 Mon Sep 17 00:00:00 2001 From: xyxinyang <43821961+xyxinyang@users.noreply.github.com> Date: Fri, 25 Jul 2025 21:02:53 +0800 Subject: [PATCH 05/57] [Model] Fix Ernie4.5MoE e_score_correction_bias parameter (#21586) Signed-off-by: zhouchong Co-authored-by: zhouchong --- vllm/model_executor/models/ernie45_moe.py | 25 +++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 984003e62d11a..5824b0967e773 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -123,14 +123,19 @@ class Ernie4_5_MoeMoE(nn.Module): quant_config=None, prefix=f"{prefix}.gate") - self.experts = FusedMoE(num_experts=config.moe_num_experts, - top_k=config.moe_k, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=True, - quant_config=quant_config, - prefix=f"{prefix}.experts") + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.moe_num_experts)) + + self.experts = FusedMoE( + num_experts=config.moe_num_experts, + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=True, + quant_config=quant_config, + prefix=f"{prefix}.experts", + e_score_correction_bias=self.gate.e_score_correction_bias) if self.moe_num_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * @@ -459,6 +464,10 @@ class Ernie4_5_MoeModel(nn.Module): if "mtp" in name: continue + if "e_score_correction_bias" in name: + name = name.replace("moe_statics", "gate") + loaded_weight = loaded_weight.squeeze(0) + for (param_name, weight_name, shard_id) in 
stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: From 29c6fbe58cfa705c26ed1b38f262d5ade0b4f9ba Mon Sep 17 00:00:00 2001 From: bigshanedogg Date: Fri, 25 Jul 2025 22:05:42 +0900 Subject: [PATCH 06/57] [MODEL] New model support for naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (#20931) Signed-off-by: bigshanedogg --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 80 ++ .../vision_language_multi_image.py | 48 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 3 + .../models/hyperclovax_vision.py | 1231 +++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 7 files changed, 1365 insertions(+) create mode 100644 vllm/model_executor/models/hyperclovax_vision.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0143d137ff3f9..0f3b730eabedc 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -365,6 +365,7 @@ th { | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | +| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e4811c023377f..eb6b410848558 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: ) +# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B +def run_hyperclovax_seed_vision( + questions: list[str], modality: str +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192 if modality == "image" else 16384, + limit_mm_per_prompt={modality: 1}, + ) + + messages = list() + for question in questions: + if modality == "image": + """ + ocr: List the words in the image in raster order. + Even if the word order feels unnatural for reading, + the model will handle it as long as it follows raster order. + e.g. "Naver, CLOVA, bigshane" + lens_keywords: List the entity names in the image. + e.g. "iPhone" + lens_local_keywords: List the entity names with quads in the image. + e.g. 
"[0.07, 0.21, 0.92, 0.90] iPhone" + """ + messages.append( + [ + { + "role": "user", + "content": [ + { + "type": "image", + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + }, + { + "type": "text", + "text": question, + }, + ], + } + ] + ) + elif modality == "video": + messages.append( + [ + { + "role": "user", + "content": [ + { + "type": "video", + }, + { + "type": "text", + "text": question, + }, + ], + } + ] + ) + else: + raise ValueError(f"Unsupported modality: {modality}") + + prompts = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=None, + ) + + # Idefics3-8B-Llama3 def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1222,6 +1301,7 @@ model_example_map = { "glm4v": run_glm4v, "glm4_1v": run_glm4_1v, "h2ovl_chat": run_h2ovl, + "hyperclovax_seed_vision": run_hyperclovax_seed_vision, "idefics3": run_idefics3, "internvl_chat": run_internvl, "nemotron_vl": run_nemotron_vl, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index eb4f3b6c8f449..2e14fc807e104 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -289,6 +289,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_hyperclovax_seed_vision( + question: str, image_urls: list[str] +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=16384, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + message = {"role": "user", "content": list()} + for _image_url in image_urls: + 
message["content"].append( + { + "type": "image", + "image": _image_url, + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + } + ) + message["content"].append( + { + "type": "text", + "text": question, + } + ) + + prompt = tokenizer.apply_chat_template( + [ + message, + ], + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs, # it will generate poor response for multi-image inputs! @@ -900,6 +947,7 @@ model_example_map = { "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, "internvl_chat": load_internvl, + "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, "kimi_vl": load_kimi_vl, "llava": load_llava, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index fd5842523178f..c2e9a73fa82f0 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -278,6 +278,7 @@ def _test_processing_correctness_one( "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", diff --git a/tests/models/registry.py b/tests/models/registry.py index 3b92462e58a85..1800262ced67f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", trust_remote_code=True), + "HCXVisionForCausalLM": _HfExamplesInfo( + 
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py new file mode 100644 index 0000000000000..3e8e50b35c0b7 --- /dev/null +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -0,0 +1,1231 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# copied from : https://github.com/huggingface/transformers +import ast +import sys +from collections import defaultdict +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from itertools import chain +from typing import Any, Literal, Optional, TypedDict, Union + +import numpy as np +import PIL +from einops import rearrange +from PIL import Image + +if sys.version_info >= (3, 11): + import typing + Unpack = typing.Unpack +else: + import typing_extensions + Unpack = typing_extensions.Unpack + +import torch +import torch.nn as nn +from timm.layers import LayerNorm, LayerNorm2d +from timm.models.regnet import RegStage +from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig, + SiglipVisionConfig) +from transformers.modeling_utils import no_init_weights + +from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, ProcessingCache, + PromptReplacement, 
PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .clip import CLIPVisionModel +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .siglip import SiglipVisionModel +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix +from .vision import get_vision_encoder_info + +EOT = "<|endofturn|>" +IMAGE_TOKEN: str = "<|dummy3|>" +VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" + + +class HCXVisionMultimodalPixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values_images: list[torch.Tensor] + """ + Shape: `[(num_grids, num_channels, height, width), ...]` if anyres + + Note that `height` or `width` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. + """ + image_sizes_images: list[tuple[Union[int, float]]] + """ + Shape: `[(height, width), ...]` + """ + vision_query_lengths_images: list[Union[int, float]] + pixel_values_videos: list[tuple[Union[int, float]]] + """ + Shape: `[(num_grids, num_channels, height, width), ...]` if anyres + """ + vision_query_lengths_videos: list[Union[int, float]] + + +HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs] + + +class HCXVisionProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) + + def get_hf_processor( + self, + **kwargs: object, + ): + processor_cls = type( + AutoProcessor.from_pretrained( + self.ctx.model_config.model, + trust_remote_code=self.ctx.model_config.trust_remote_code, + )) + return self.ctx.get_hf_processor( + processor_cls, + **kwargs, + ) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_num_image_tokens( + self, + *, + vision_query_length: Union[int, list[int]], + ) -> int: + if 
isinstance(vision_query_length, int): + return vision_query_length + else: + return sum(vision_query_length) + + def get_num_video_tokens( + self, + *, + vision_query_length: Union[int, list[int]], + ) -> int: + if isinstance(vision_query_length, int): + return vision_query_length + else: + return sum(vision_query_length) + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + +class HCXVisionDummyInputsBuilder( + BaseDummyInputsBuilder[HCXVisionProcessingInfo]): + + def get_dummy_text( + self, + mm_counts: Mapping[str, int], + ) -> str: + dummy_text = IMAGE_TOKEN * mm_counts.get( + "image", 0) + VIDEO_TOKEN * mm_counts.get("video", 0) + return dummy_text + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = 32 + return { + "image": + self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + ), + "video": + self._get_dummy_videos( + width=target_width - 1, + height=target_height - 1, + num_frames=target_num_frames, + num_videos=num_videos, + ) + } + + +class HCXVisionMultiModalProcessor( + BaseMultiModalProcessor[HCXVisionProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + + def replace_multimodal_token( + token_ids: torch.Tensor, + target_token: int, + repeats: list, + 
): + output = list() + _repeats_idx = 0 + for token_id in token_ids: + if token_id == target_token: + output += [ + token_id.item(), + ] * repeats[_repeats_idx] + _repeats_idx += 1 + else: + output += [ + token_id.item(), + ] + return torch.tensor(output, device=token_ids.device) + + for video_idx, video_arr in enumerate(mm_data.get("videos", list())): + if video_arr.dtype == np.uint8: + continue + mm_data["videos"][video_idx] = video_arr.astype(np.uint8) + + processed_outputs = self.info.ctx.call_hf_processor( + hf_processor=self.info.get_hf_processor(**mm_kwargs), + data=dict( + text=prompt, + images=None, + videos=None, + ), + ) # text-only + + if len(mm_data) > 0: + # batchify input as a single item + images = mm_data.get("images", None) + num_images = 0 + if images is not None: + num_images = len(images) + images = [ + images, + ] # batchify + + videos = mm_data.get("videos", + None) # list of video in single conversation + num_videos = 0 + if videos is not None: + num_videos = len(videos) + videos = [ + videos, + ] # batchify + + _processed_outputs = self.info.ctx.call_hf_processor( + hf_processor=self.info.get_hf_processor(**mm_kwargs), + data=dict( + text=None, + images=images, + videos=videos, + ), + ) # mm-only + + for k, v in _processed_outputs.items(): + if len(v) < 1: + continue + elif k.endswith("_images"): + # list of list of 4D tensor -> list of 4D tensor + _processed_outputs[k] = v[0] + elif k.endswith("_videos"): + # list of list of 4D tensor -> list of 4D tensor + v = v[0] + if k == "pixel_values_videos": + v = torch.cat(v, dim=0) + _c, _w, _h = v.shape[-3:] + v = v.reshape(num_videos, -1, _c, _w, _h) + v = list(torch.unbind(v, dim=0)) + _processed_outputs[k] = v + + if num_images > 0: + tokenizer = self.info.get_tokenizer() + processed_outputs["input_ids"] = torch.stack([ + replace_multimodal_token( + token_ids=_input_ids, + target_token=tokenizer.convert_tokens_to_ids( + IMAGE_TOKEN), + repeats=_processed_outputs[ + 
"vision_query_lengths_images"], + ) for _input_ids in processed_outputs["input_ids"] + ], + dim=0) + + if num_videos > 0: + tokenizer = self.info.get_tokenizer() + processed_outputs["input_ids"] = torch.stack([ + replace_multimodal_token( + token_ids=_input_ids, + target_token=tokenizer.convert_tokens_to_ids( + VIDEO_TOKEN), + repeats=_processed_outputs[ + "vision_query_lengths_videos"], + ) for _input_ids in processed_outputs["input_ids"] + ], + dim=0) + + _ratios = [ + len(_pixel_values) for _pixel_values in + _processed_outputs["pixel_values_videos"] + ] + _num_per_videos = [ + int(_e / sum(_ratios) * + len(_processed_outputs["vision_query_lengths_videos"])) + for _e in _ratios + ] + _processed_outputs["vision_query_lengths_videos"] = [ + _processed_outputs["vision_query_lengths_videos"] + [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(0, num_videos) + ] + + processed_outputs.update(_processed_outputs) + + return processed_outputs + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + placeholder = { + "image": hf_config.image_token_id, + "video": hf_config.video_token_id, + } + + def get_replacement_hyperclovax( + item_idx: int, + modality: str, + out_mm_kwargs: MultiModalKwargs, + ): + num_tokens = None + if modality == "image": + num_tokens = self.info.get_num_image_tokens( + vision_query_length=out_mm_kwargs[ + "vision_query_lengths_images"][item_idx], ) + if modality == "video": + num_tokens = self.info.get_num_video_tokens( + vision_query_length=out_mm_kwargs[ + "vision_query_lengths_videos"][item_idx], ) + assert isinstance(num_tokens, int) + return [ + placeholder[modality], + ] * num_tokens + + return [ + PromptReplacement( + modality=modality, + target=[ + placeholder[modality], + ], + replacement=partial( + get_replacement_hyperclovax, + 
modality=modality, + out_mm_kwargs=out_mm_kwargs, + ), + ) for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + # image + pixel_values_images=MultiModalFieldConfig.batched("image"), + image_sizes_images=MultiModalFieldConfig.batched("image"), + vision_query_lengths_images=MultiModalFieldConfig.batched("image"), + num_queries_vis_abstractors_images=MultiModalFieldConfig.batched( + "image"), + num_queries_vis_abstractors_slow_images=MultiModalFieldConfig. + batched("image"), + first_last_frames_slows_images=MultiModalFieldConfig.batched( + "image"), + # video + pixel_values_videos=MultiModalFieldConfig.batched("video"), + image_sizes_videos=MultiModalFieldConfig.batched("video"), + vision_query_lengths_videos=MultiModalFieldConfig.batched("video"), + num_queries_vis_abstractors_videos=MultiModalFieldConfig.batched( + "video"), + num_queries_vis_abstractors_slow_videos=MultiModalFieldConfig. 
+ batched("video"), + first_last_frames_slows_videos=MultiModalFieldConfig.batched( + "video"), + ) + + +def _build_hcxvision_hf_info( + ctx: InputProcessingContext, ) -> HCXVisionProcessingInfo: + return HCXVisionProcessingInfo(ctx) + + +def _build_hcxvision_hf_processor( + info: HCXVisionProcessingInfo, + dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo], + *, + cache: Optional[ProcessingCache] = None, +) -> BaseMultiModalProcessor: + if isinstance(info, HCXVisionProcessingInfo): + return HCXVisionMultiModalProcessor( + info, + dummy_inputs, # type: ignore + cache=cache, + ) + + raise NotImplementedError(type(info)) + + +def init_vision_tower_for_hcxvision( + vision_config, + quant_config: Optional[QuantizationConfig], + *, + use_nth_layer: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", +) -> Union[CLIPVisionModel, SiglipVisionModel]: + num_hidden_layers = vision_config.num_hidden_layers + if not isinstance(use_nth_layer, int): + pass + elif use_nth_layer >= 0: + num_hidden_layers = use_nth_layer + 1 + else: + num_hidden_layers = num_hidden_layers + use_nth_layer + 1 + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, + prefix=prefix, + ) + elif isinstance(vision_config, SiglipVisionConfig): + return SiglipVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, + prefix=prefix, + ) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +class HCXVisionMlp(nn.Module): + + def __init__( + self, + mm_projector_type, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + 
self.mm_projector_type = mm_projector_type + if self.mm_projector_type == "mlp": + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + elif self.mm_projector_type == "inverted_mlp": + self.fc1 = nn.Linear(in_features, 2 * hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(2 * hidden_features, out_features) + else: + raise NotImplementedError("{} is not implemented".format( + self.mm_projector_type)) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +class HCXVisionCAbstractor(nn.Module): + """ + This module is based on C-Abstractor, whose license is under apache-2.0. + You can check the original code at + https://github.com/khanrc/honeybee/blob/main/honeybee/projectors/projectors.py + and we made necessary modifications. + """ + + def __init__( + self, + num_queries: int, + num_input_tokens: int, + encoder_hidden_size: int, + hidden_size: int, + output_hidden_size: int, + pos_emb: bool = True, + prenorm: bool = False, + ): + super().__init__() + self.num_input_tokens = num_input_tokens + self.output_hidden_size = output_hidden_size + + # Positional embedding + if pos_emb: + self.pos_emb = torch.nn.Parameter( + torch.zeros(1, num_input_tokens, encoder_hidden_size)) + self.pos_emb.data.normal_(mean=0.0, std=0.02) + else: + self.pos_emb = None + + # (Optional) Pre-normalization layer + if prenorm: + self.prenorm = LayerNorm(encoder_hidden_size) + else: + self.prenorm = None + + self.build_net(num_queries, encoder_hidden_size, hidden_size, + output_hidden_size) + self.dtype = next(self.parameters()).dtype + + def forward( + self, + x: torch.Tensor, + num_queries_vis_abstractors: Optional[list[list[int]]] = None, + num_grids: Optional[list[int]] = None, + ) -> torch.Tensor: + if self.prenorm is not None: + x = self.prenorm(x) + + if self.pos_emb is not None: + x = x + self.pos_emb + + x = self._forward( + x, + 
num_queries_vis_abstractors=num_queries_vis_abstractors, + num_grids=num_grids, + ) # (B, L, output_hidden_size) + + return x + + def _forward( + self, + x: torch.Tensor, + num_queries_vis_abstractors: Optional[list[list[int]]] = None, + num_grids: Optional[list[int]] = None, + ) -> torch.Tensor: + # x: [B, L, dim] + B, L, dim = x.shape + hw = int(L**0.5) + x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw) + + if num_queries_vis_abstractors is not None: + assert num_grids is not None + return self._forward_adaptive_num_query( + x, num_queries_vis_abstractors, num_grids) + + x = self.net(x) + x = rearrange(x, "b d h w -> b (h w) d") + x = self.readout(x) + return x + + def _forward_adaptive_num_query( + self, + x: torch.Tensor, + num_queries_vis_abstractors: Optional[list[list[int]]] = None, + num_grids: Optional[list[int]] = None, + ) -> list[torch.Tensor]: + # self.net is consisted by 3 layers (s1, sampler, s2) + assert len(self.net) == 3 + + x = self.net[0](x) # s1 + new_x = [] + for i, num_queries in enumerate(num_queries_vis_abstractors): + hw = int(num_queries**0.5) + sampler = nn.AdaptiveAvgPool2d((hw, hw)) + out = sampler(x[num_grids[i]:num_grids[i + 1], :]) + out = self.net[2](out) # s2 + + out = rearrange(out, "b d h w -> b (h w) d") + out = self.readout(out) + + new_x.append(out) + return new_x + + def build_net( + self, + n_queries: int, + encoder_hidden_size: int, + hidden_size: int, + output_hidden_size: int, + depth: int = 3, + mlp_depth: int = 2, + ): + assert (n_queries**0.5).is_integer( + ), f"n_queries must be square number. 
n_queries: {n_queries}" + hw = int(n_queries**0.5) + + # RegBlock = ResBlock + SE + RegBlock = partial( + RegStage, + stride=1, + dilation=1, + act_layer=nn.SiLU, + norm_layer=LayerNorm2d, + ) + + s1 = RegBlock( + depth, + encoder_hidden_size, + hidden_size, + ) + sampler = nn.AdaptiveAvgPool2d((hw, hw)) + s2 = RegBlock( + depth, + hidden_size, + hidden_size, + ) + + self.net = nn.Sequential(s1, sampler, s2) + self.readout = self.build_mlp(mlp_depth, hidden_size, + output_hidden_size) + + def build_mlp( + self, + depth: int, + hidden_size: int, + output_hidden_size: int, + ): + layers = [nn.Linear(hidden_size, output_hidden_size)] + for _ in range(1, depth): + layers.append(nn.SiLU()) + layers.append(nn.Linear(output_hidden_size, output_hidden_size)) + return nn.Sequential(*layers) + + +@MULTIMODAL_REGISTRY.register_processor( + _build_hcxvision_hf_processor, + info=_build_hcxvision_hf_info, + dummy_inputs=HCXVisionDummyInputsBuilder) +class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + **kwargs: Optional[Any], + ) -> None: + super().__init__() + + # init configs + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + # text_config + text_config = config.text_config + if text_config.model_type in ["gpt2", "hyperclovax", "llama"]: + text_config._attn_implementation = "sdpa" + if text_config.model_type != "hyperclovax": + text_config.logits_scaling = 1.0 + # vision_config + vision_config = config.vision_config + vision_config.auto_map = {} + vision_config.anyres = config.anyres + vision_config.max_num_grids = config.max_num_grids + self.dtype = vllm_config.model_config.dtype + + ## possible_resolution should be matched with preprocessor_config.json + config.possible_resolutions = self._init_possible_resolutions( + 
config, vision_config) + + # init models & parameters + with no_init_weights(): # weight will be loaded in from_pretrained + self.vision_model = init_vision_tower_for_hcxvision( + vision_config, + quant_config, + use_nth_layer=getattr(config, "use_nth_layer", -1), + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_model"), + ) + self.mm_projector = self._init_mm_projector(config, text_config, + vision_config) + + self.lm_head_vocab_size = getattr(text_config, "padded_vocab_size", + text_config.vocab_size) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + if config.anyres: + self.image_newline = nn.Parameter( + torch.empty(text_config.hidden_size, dtype=self.dtype)) + + self.config = config + self.vision_config = vision_config + self.text_config = text_config + + # use_sum_loss = bool(kwargs.pop("use_sum_loss", False)) + # self.reduction = self._init_reduction_type(use_sum_loss) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return IMAGE_TOKEN + if modality.startswith("video"): + return VIDEO_TOKEN + + raise ValueError("Only image or video modality is supported") + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, + **kwargs: Unpack[HCXVisionMultimodalInputs], + ) -> Optional[MultiModalEmbeddings]: + + multimodal_embeddings = list() + if kwargs.get("pixel_values_images") is not None: + for _pixel_values_images, _image_sizes_images in zip( + kwargs["pixel_values_images"], + kwargs["image_sizes_images"]): + _pixel_values_images = _pixel_values_images.unsqueeze(dim=0) + _image_sizes_images = _image_sizes_images.unsqueeze(dim=0) + _len_pixel_values_images = [ + len(pixel_value) for pixel_value in _pixel_values_images + ] + if isinstance(_image_sizes_images, torch.Tensor): + _image_sizes_images = 
_image_sizes_images.detach().cpu( + ).tolist() + _multimodal_embeddings_images = self.forward_images( + pixel_values_images=_pixel_values_images, + image_sizes_images=_image_sizes_images, + len_pixel_values_images=_len_pixel_values_images, + ) + _multimodal_embeddings_images = torch.cat( + _multimodal_embeddings_images, dim=0) + multimodal_embeddings.append(_multimodal_embeddings_images) + + if kwargs.get("pixel_values_videos") is not None: + for _pixel_values_videos, _vision_query_lengths_videos in zip( + kwargs["pixel_values_videos"], + kwargs["vision_query_lengths_videos"]): + _len_pixel_values_videos = [ + len(_vision_query_lengths) + for _vision_query_lengths in _vision_query_lengths_videos + ] + _c, _w, _h = _pixel_values_videos.shape[-3:] + _pixel_values_videos = _pixel_values_videos.reshape( + sum(_len_pixel_values_videos), -1, _c, _w, + _h).unsqueeze(dim=0) + _multimodal_embeddings_videos = self.forward_videos( + pixel_values_videos=_pixel_values_videos, + len_pixel_values_videos=_len_pixel_values_videos, + ) + _multimodal_embeddings_videos = torch.cat( + _multimodal_embeddings_videos, dim=0) + multimodal_embeddings.append(_multimodal_embeddings_videos) + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + **kwargs, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if (kwargs.get("pixel_values_images") is not None + or kwargs.get("pixel_values_videos") + is not None): # v0 compatibility + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + if multimodal_embeddings is not None: + multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0) + _mask_image = input_ids == self.config.image_token_id + _mask_video = input_ids == self.config.video_token_id + assert _mask_image.sum() + _mask_video.sum() == len( + multimodal_embeddings) + + if multimodal_embeddings.dtype != inputs_embeds.dtype: + 
multimodal_embeddings = multimodal_embeddings.to( + dtype=inputs_embeds.dtype) + if multimodal_embeddings.device != inputs_embeds.device: + multimodal_embeddings = multimodal_embeddings.to( + device=inputs_embeds.device) + + if _mask_image.sum() > 0: + inputs_embeds[ + _mask_image] = multimodal_embeddings[:sum(_mask_image)] + if _mask_video.sum() > 0: + inputs_embeds[_mask_video] = multimodal_embeddings[ + -sum(_mask_video):] + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids=input_ids, + **kwargs) + input_ids = None + hidden_states = self.language_model.model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states + + def forward_images( + self, + pixel_values_images: list[list[torch.FloatTensor]], + image_sizes_images: list[list[tuple[int, int]]], + len_pixel_values_images: list[int], + ) -> list[list[torch.Tensor]]: + if sum(len_pixel_values_images) == 0: + return None + + concat_pixel_values_images = torch.cat(list( + chain(*pixel_values_images)), + dim=0) + + visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1 + image_forward_outs = self.vision_model( + concat_pixel_values_images)[:, visual_token_idx:] + + image_forward_outs = image_forward_outs.to( + dtype=self.mm_projector.dtype) + image_forward_outs = self.mm_projector(image_forward_outs) # b (h w) d + + split_sizes = [ + pixel_value.shape[0] for pixel_value in chain(*pixel_values_images) + ] + image_forward_outs = torch.split(image_forward_outs, + split_sizes, 
+ dim=0) + + # newline for anyres postprocessing + image_features = anyres_postprocessing( + image_forward_outs=image_forward_outs, + image_sizes=[ + image_size for image_sizes in image_sizes_images + for image_size in image_sizes + ], + num_queries_vis_abstractor=self.config. + num_queries_vis_abstractor_image, + unpad=self.config.unpad, + patch_size=self.vision_config.patch_size, + grid_size=self.vision_config.image_size, + image_newline=self.image_newline, + possible_resolutions=self.config.possible_resolutions, + ) + return image_features + + def forward_videos( + self, + pixel_values_videos: list[list[torch.FloatTensor]], + len_pixel_values_videos: list[int], + ) -> list[torch.Tensor]: + + len_video_grids = sum(len_pixel_values_videos) + if len_video_grids == 0: + return None + + # Run Vision Model + concat_pixel_values_videos = torch.cat(list( + chain(*pixel_values_videos)), + dim=0) + + visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1 + video_forward_outs = self.vision_model( + concat_pixel_values_videos)[:, visual_token_idx:] + + video_forward_outs = video_forward_outs.to( + dtype=self.mm_projector.dtype) + + # Run MM-Projector + # len(num_grids) == len(num_queries_vis_abstractors) + 1 + grid_idx = 0 + num_grids = [ + grid_idx + ] # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56] + num_queries_vis_abstractors = [ + ] # e.g. 
[81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9] + len_total_frames = video_forward_outs.shape[0] + + if self.config.first_last_frames_slow: + # slowfast (first_last_frames_slow) + assert len_total_frames != 0 + if len_total_frames <= 2: + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_slow) + grid_idx += len_total_frames + num_grids.append(grid_idx) + else: + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_slow) + grid_idx += 1 + num_grids.append(grid_idx) + + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_fast) + grid_idx += len_total_frames - 2 + num_grids.append(grid_idx) + + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_slow) + grid_idx += 1 + num_grids.append(grid_idx) + else: + # slowfast + for pixel_values_frames in pixel_values_videos: + for pixel_values_frame in pixel_values_frames: + if len(pixel_values_frame) > 0: + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_slow) + grid_idx += 1 + num_grids.append(grid_idx) + num_queries_vis_abstractors.append( + self.config.num_queries_vis_abstractor_video_fast) + grid_idx = grid_idx + len(pixel_values_frame) - 1 + num_grids.append(grid_idx) + + video_forward_outs = self.mm_projector(video_forward_outs, + num_queries_vis_abstractors, + num_grids) + + video_features = [] # what we want to return + target_features = [] + target_group_size = 0 + group_counter = 0 + video_groups = [ + len(frame) for frames in pixel_values_videos for frame in frames + ] # for concat video features after projector + + for forward_out in video_forward_outs: + target_group_size += len(forward_out) + target_features.append(forward_out.flatten(0, 1)) + + video_group_size = video_groups[group_counter] + if video_group_size == target_group_size: + video_features.append(torch.cat(target_features, dim=0)) + target_features = [] + group_counter += 1 + target_group_size = 0 
+ + elif video_group_size < target_group_size: + raise RuntimeError(f"video_group_size < target_group_size!! \ + [{video_group_size} < {target_group_size}]") + + assert len(target_features + ) == 0, f"target_features is not empty!! {target_features}" + assert len(video_groups) == len(video_features) + + return video_features + + def _prepare_multimodal_kwargs(self, **kwargs: object): + output = defaultdict(list) + for k, v in kwargs.items(): + if len(v) < 1 or len(v[0]) < 1: + continue # if empty batch of empty sample + + new_k, is_video = k, False + if (not k.endswith("_images") and not k.endswith("_videos")): + pass + else: + new_k, is_video = k.split("_")[:-1], k.split("_")[-1] + new_k = "_".join(new_k) + is_video = is_video == "videos" + + for _sample_idx, _v in enumerate(v): # batch -> sample + if new_k not in ["pixel_values"]: + if len(output[new_k]) < _sample_idx + 1: + output[new_k].append(list()) + _v = _v.detach().cpu().numpy().tolist() + output[new_k][_sample_idx] += _v + elif isinstance(_v, torch.Tensor): + if len(output[new_k]) < _sample_idx + 1: + output[new_k].append(list()) + output["is_videos"].append(list()) + _v = list(torch.unbind(_v, dim=0)) + output[new_k][_sample_idx] += _v + output["is_videos"][_sample_idx] += [ + is_video, + ] * len(_v) + return dict(output) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights( + self, + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def _init_possible_resolutions( + self, + config, + vision_config, + ): + if not getattr(config, "possible_resolutions", []): + possible_resolutions = [] + if config.anyres: + assert config.max_num_grids > 0 + for i in range(1, config.max_num_grids + 1): + for j in range(1, config.max_num_grids + 1): + if i == 
1 and j == 1 and not config.use_1x1_grid: + continue + if i * j <= config.max_num_grids: + possible_resolutions.append([i, j]) + + possible_resolutions = [[ + ys * vision_config.image_size, + xs * vision_config.image_size + ] for ys, xs in possible_resolutions] + return possible_resolutions + else: + return config.possible_resolutions + + def _init_mm_projector( + self, + config, + text_config, + vision_config, + ): + input_hidden_size = vision_config.hidden_size + if config.mm_projector_type == "linear": + mm_projector = nn.Linear(input_hidden_size, + text_config.hidden_size) + mm_projector.dtype = next(mm_projector.parameters()).dtype + elif config.mm_projector_type == "cabstractor": + mm_projector = HCXVisionCAbstractor( + num_queries=config.num_queries_vis_abstractor_image, + num_input_tokens=(vision_config.image_size // + vision_config.patch_size)**2, + encoder_hidden_size=input_hidden_size, + hidden_size=input_hidden_size, + output_hidden_size=text_config.hidden_size, + pos_emb=config.proj_pos_emb, + prenorm=config.proj_prenorm, + ) + else: + mm_projector = HCXVisionMlp( + config.mm_projector_type, + input_hidden_size, + hidden_features=input_hidden_size, + out_features=self.text_config.hidden_size, + ) + return mm_projector + + +def unpad_image(tensor: torch.Tensor, + original_size: tuple[int, int]) -> torch.Tensor: + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor 
= tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +def select_best_resolution(original_size: tuple, + possible_resolutions: list) -> tuple: + original_height, original_width = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + for height, width in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int( + original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (height, width) + + return best_fit + + +def get_anyres_image_grid_shape( + image_size: tuple[int, int], + grid_pinpoints: Union[str, list[tuple[int, int]]], + patch_size: int, +) -> tuple[int, int]: + possible_resolutions = grid_pinpoints if isinstance( + grid_pinpoints, list) else ast.literal_eval(grid_pinpoints) + + original_width, original_height = image_size + height, width = select_best_resolution((original_height, original_width), + possible_resolutions) + return width // patch_size, height // patch_size + + +def reshape_and_unpad_image_features( + image_feature: torch.Tensor, + height: int, + width: int, + image_size: tuple[int, int], + possible_resolutions: list[tuple[int, int]], + grid_size: int, + unpad: bool, + image_newline: torch.Tensor, +) -> torch.Tensor: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + + assert (height * width == base_image_feature.shape[0] + ), f"height: {height}, width: {width}, \ + base_image_feature.shape[0]: {base_image_feature.shape[0]}" + + num_patch_width, 
num_patch_height = get_anyres_image_grid_shape( + image_size, possible_resolutions, grid_size) + image_feature = image_feature.view(num_patch_height, num_patch_width, + height, width, -1) + + if unpad: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_size) + image_feature = torch.cat( + ( + image_feature, + image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1).to(image_feature.device), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + + return image_feature + + +def anyres_postprocessing( + image_forward_outs: list[torch.FloatTensor], + image_sizes: list[list[int]], + possible_resolutions: list[tuple[int, int]], + patch_size: int, + grid_size: int, + image_newline: torch.FloatTensor, + num_queries_vis_abstractor: int = -1, + unpad: bool = False, +) -> list[torch.FloatTensor]: + height = width = grid_size // patch_size + + if num_queries_vis_abstractor > 0: + assert (num_queries_vis_abstractor**0.5 + ).is_integer(), "n_queries must be square number" + height = width = int(num_queries_vis_abstractor**0.5) + + # post-processing (unpad, add newline) + new_image_features = [] + for image_idx, image_feature in enumerate(image_forward_outs): + if image_feature.shape[0] > 1: + image_feature = reshape_and_unpad_image_features( + image_feature=image_feature, + height=height, + width=width, + image_size=image_sizes[image_idx], + possible_resolutions=possible_resolutions, + grid_size=grid_size, # Pass grid info if needed by helper + unpad=unpad, + image_newline=image_newline, + ) + else: + image_feature = image_feature[0] + image_feature = torch.cat( + (image_feature, 
image_newline[None].to(image_feature.device)), + dim=0) + new_image_features.append(image_feature) + image_features = new_image_features + return image_features + + +def resize_image( + image: Union[np.ndarray, PIL.Image.Image], + max_side: int = 378, +) -> np.ndarray: + image_arr = image + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + + width, height = image.size + cur_max_size = max(width, height) + if cur_max_size <= max_side: + return image_arr + + scale = max_side / cur_max_size + width = int(width * scale) + height = int(height * scale) + image = image.resize((width, height), Image.LANCZOS) + image_arr = np.array(image) + return image_arr diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7470b31e1253d..14a8ac7876f73 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -81,6 +81,7 @@ _TEXT_GENERATION_MODELS = { "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"), "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"), "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"), + "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), From 9fe98d42508f9a67d5b1c491a4a306966ded3976 Mon Sep 17 00:00:00 2001 From: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Date: Fri, 25 Jul 2025 06:49:11 -0700 Subject: [PATCH 07/57] [Frontend] Add request_id to the Request object so they can be controlled better via external load balancers (#21009) Signed-off-by: Kourosh Hakhamaneshi --- vllm/entrypoints/openai/protocol.py | 21 +++++++++++++++++++ vllm/entrypoints/openai/serving_completion.py | 4 +++- vllm/entrypoints/openai/serving_embedding.py | 5 +++-- 3 files changed, 27 insertions(+), 3 deletions(-) diff 
--git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6c6ec207a3cac..b6b3bf3f530e3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel): "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling."), ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response."), + ) logits_processors: Optional[LogitsProcessors] = Field( default=None, description=( @@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling."), ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response."), + ) # --8<-- [end:embedding-extra-params] @@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel): "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling."), ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. 
This id is used " + "through out the inference process and return in response."), + ) # --8<-- [end:chat-embedding-extra-params] @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 323795ca4372d..22c6b6250394c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing): return self.create_error_response( "Echo is unsupported with prompt embeds.") - request_id = f"cmpl-{self._base_request_id(raw_request)}" + request_id = ( + f"cmpl-" + f"{self._base_request_id(raw_request, request.request_id)}") created_time = int(time.time()) request_metadata = RequestResponseMetadata(request_id=request_id) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 697f43c018b27..84ba00873103d 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin): for the API specification. This API mimics the OpenAI Embedding API. 
""" model_name = self._get_model_name(request.model) - request_id = (f"{self.request_id_prefix}-" - f"{self._base_request_id(raw_request)}") + request_id = ( + f"{self.request_id_prefix}-" + f"{self._base_request_id(raw_request, request.request_id)}") ctx = EmbeddingServeContext( request=request, From eab2f3980cd30132dd99e23ea8d6fc0274365757 Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Fri, 25 Jul 2025 09:49:36 -0400 Subject: [PATCH 08/57] [Model] Replace Mamba2 RMSNorm Gated with Fused Triton Kernel (#20839) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: Yu Chin Fabian Lim Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Yu Chin Fabian Lim --- .../layers/mamba/mamba_mixer2.py | 21 +-- .../layers/mamba/ops/layernorm_gated.py | 168 ++++++++++++++++++ 2 files changed, 176 insertions(+), 13 deletions(-) create mode 100644 vllm/model_executor/layers/mamba/ops/layernorm_gated.py diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index e32b2be4d40e7..2c95099e53ad6 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( extra_groups_for_head_shards, get_mamba_state_shape) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_state_update) from vllm.model_executor.layers.mamba.ops.ssd_combined import ( @@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp): return x * nn.functional.silu(gate.to( torch.float32)).to(input_dtype) - if self.tp_size > 1 or self.n_groups != 1: + if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1): return 
self.forward_native(x, gate) - from vllm import _custom_ops as ops - - # cast x and gate to float32 before silu - out = torch.empty_like(x) - y = x * nn.functional.silu(gate.to(torch.float32)) - ops.rms_norm( - out, - y.to(x.dtype), - self.weight.data, - self.variance_epsilon, - ) - return out + return rms_norm_gated(x, + self.weight.data, + bias=None, + z=gate, + eps=self.variance_epsilon, + norm_before_gate=False) def mamba_v2_sharded_weight_loader( diff --git a/vllm/model_executor/layers/mamba/ops/layernorm_gated.py b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py new file mode 100644 index 0000000000000..f3a45ab097c34 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py @@ -0,0 +1,168 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2024, Tri Dao. +# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py + +import torch + +from vllm.triton_utils import tl, triton + + +@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None}) +@triton.jit +def _layer_norm_fwd_1pass_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row: tl.int64, + stride_y_row: tl.int64, + stride_z_row: tl.int64, + M: tl.int64, # number of rows in X + N: tl.int64, # number of columns in X + eps, # epsilon to avoid division by zero + BLOCK_N: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. 
+ row = tl.program_id(0) + group = tl.program_id(1) + X += row * stride_x_row + group * N + Y += row * stride_y_row + group * N + if HAS_Z: + Z += row * stride_z_row + group * N + if not IS_RMS_NORM: + Mean += group * M + Rstd += group * M + W += group * N + if HAS_BIAS: + B += group * N + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) + if HAS_Z and not NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=cols < N).to(tl.float32) + x *= z * tl.sigmoid(z) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w + b if HAS_BIAS else x_hat * w + if HAS_Z and NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask).to(tl.float32) + y *= z * tl.sigmoid(z) + # Write output + tl.store(Y + cols, y, mask=mask) + + +def _layer_norm_fwd(x, + weight, + bias, + eps, + z=None, + out=None, + group_size=None, + norm_before_gate=True, + is_rms_norm=False): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + assert weight.shape == (N, ) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N, ) + # allocate output + if out is not None: + assert out.shape == x.shape + else: + out = torch.empty_like(x) + assert out.stride(-1) == 1 + mean = torch.empty((ngroups * M, ), dtype=torch.float32, + 
device=x.device) if not is_rms_norm else None + rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + grid = (M, ngroups) + with torch.cuda.device(x.device.index): + _layer_norm_fwd_1pass_kernel[grid](x, + out, + weight, + bias, + z, + mean, + rstd, + x.stride(0), + out.stride(0), + z.stride(0) if z is not None else 0, + M, + group_size, + eps, + BLOCK_N=BLOCK_N, + NORM_BEFORE_GATE=norm_before_gate, + IS_RMS_NORM=is_rms_norm, + num_warps=num_warps) + return out, mean, rstd + + +def rms_norm_gated(x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if z is not None: + assert z.shape == x_shape_og + z = z.reshape(-1, z.shape[-1]) + if z.stride(-1) != 1: + z = z.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + y, _, _ = _layer_norm_fwd(x, + weight, + bias, + eps, + z=z, + group_size=group_size, + norm_before_gate=norm_before_gate, + is_rms_norm=True) + + return y.reshape(x_shape_og) From b3caeb82e7407d5faa30c49aecd951df3dafd42c Mon Sep 17 00:00:00 2001 From: who who who Date: Fri, 25 Jul 2025 21:50:21 +0800 Subject: [PATCH 09/57] [ROCm][AITER] Enable fp8 kv cache on rocm aiter backend. 
(#20295) Signed-off-by: fsx950223 Signed-off-by: amd-ruitang3 Co-authored-by: amd-ruitang3 --- .../attention/test_aiter_flash_attn.py | 191 +++++++++++++++ vllm/v1/attention/backends/rocm_aiter_fa.py | 225 ++++++++++-------- 2 files changed, 320 insertions(+), 96 deletions(-) create mode 100644 tests/kernels/attention/test_aiter_flash_attn.py diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py new file mode 100644 index 0000000000000..d0687c62b1132 --- /dev/null +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import pytest +import torch + +import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 +from vllm.platforms import current_platform + +NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +HEAD_SIZES = [128, 256] +BLOCK_SIZES = [16, 32] +DTYPES = [torch.float16, torch.bfloat16] +QDTYPES = [None] +# one value large enough to test overflow in index calculation. 
+# one value small enough to test the schema op check +NUM_BLOCKS = [32768, 2048] + + +def ref_paged_attn( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + query_lens: list[int], + kv_lens: list[int], + block_tables: torch.Tensor, + scale: float, + sliding_window: Optional[int] = None, + soft_cap: Optional[float] = None, +) -> torch.Tensor: + num_seqs = len(query_lens) + block_tables = block_tables.cpu().numpy() + _, block_size, num_kv_heads, head_size = key_cache.shape + + outputs: list[torch.Tensor] = [] + start_idx = 0 + for i in range(num_seqs): + query_len = query_lens[i] + kv_len = kv_lens[i] + q = query[start_idx:start_idx + query_len] + q *= scale + + num_kv_blocks = (kv_len + block_size - 1) // block_size + block_indices = block_tables[i, :num_kv_blocks] + + k = key_cache[block_indices].view(-1, num_kv_heads, head_size) + k = k[:kv_len] + v = value_cache[block_indices].view(-1, num_kv_heads, head_size) + v = v[:kv_len] + + if q.shape[1] != k.shape[1]: + k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1) + v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1) + attn = torch.einsum("qhd,khd->hqk", q, k).float() + empty_mask = torch.ones(query_len, kv_len) + mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() + if sliding_window is not None: + sliding_window_mask = torch.triu(empty_mask, + diagonal=kv_len - + (query_len + sliding_window) + + 1).bool().logical_not() + mask |= sliding_window_mask + if soft_cap is not None: + attn = soft_cap * torch.tanh(attn / soft_cap) + attn.masked_fill_(mask, float("-inf")) + attn = torch.softmax(attn, dim=-1).to(v.dtype) + out = torch.einsum("hqk,khd->qhd", attn, v) + + outputs.append(out) + start_idx += query_len + + return torch.cat(outputs, dim=0) + + +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="Only ROCm is supported") +@pytest.mark.parametrize("seq_lens", + [[(10, 1328), (5, 18), + (129, 463)], [(8, 523), (24, 37), (3, 2011)]]) 
+@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("soft_cap", [None]) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("q_dtype", QDTYPES) +@torch.inference_mode() +def test_varlen_with_paged_kv( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + sliding_window: Optional[int], + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], + num_blocks: int, + q_dtype: Optional[torch.dtype], +) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(0) + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_query_len = max(query_lens) + max_kv_len = max(kv_lens) + window_size = ((sliding_window - 1, 0) if sliding_window is not None else + (-1, -1)) + scale = head_size**-0.5 + + query = torch.randn(sum(query_lens), + num_query_heads, + head_size, + dtype=dtype) + key_cache = torch.randn(num_blocks, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + value_cache = torch.randn_like(key_cache) + cu_query_lens = torch.tensor([0] + query_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + + cu_seq_lens = torch.tensor([0] + kv_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + kv_lens = torch.tensor(kv_lens, dtype=torch.int32) + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + num_blocks, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + output = torch.empty_like(query) + + maybe_quantized_query = query + maybe_quantized_key_cache = key_cache + maybe_quantized_value_cache = value_cache + 
k_descale = None + v_descale = None + if q_dtype is not None: + # QKV are drawn from N(0, 1): no need for a fp8 scaling factor + maybe_quantized_query = query.to(q_dtype) + maybe_quantized_key_cache = key_cache.to(q_dtype) + maybe_quantized_value_cache = value_cache.to(q_dtype) + + scale_shape = (num_seqs, num_kv_heads) + k_descale = torch.ones(scale_shape, dtype=torch.float32) + v_descale = torch.ones(scale_shape, dtype=torch.float32) + + torch.ops.vllm.flash_attn_varlen_func( + maybe_quantized_query, + maybe_quantized_key_cache, + maybe_quantized_value_cache, + out=output, + cu_seqlens_q=cu_query_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len, + softmax_scale=scale, + alibi_slopes=None, + window_size=window_size, + block_table=block_tables, + cu_seqlens_k=cu_seq_lens, + k_scale=k_descale, + v_scale=v_descale, + ) + + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=query_lens, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + sliding_window=sliding_window, + soft_cap=soft_cap, + ) + + atol, rtol = 2e-2, 2e-2 + if q_dtype is not None: + atol, rtol = 1.5e-1, 1.5e-1 + torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \ + f"{torch.max(torch.abs(output - ref_output))}" diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 0739d2596676f..85a5dc8c91c13 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,20 +2,21 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import Optional +from typing import ClassVar, Optional import torch -from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) + AttentionMetadata, 
AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec +_PARTITION_SIZE_ROCM = 256 + if current_platform.is_rocm(): import aiter @@ -32,38 +33,54 @@ if current_platform.is_rocm(): b_seq_lens_loc, block_table, block_table_stride_0, + k_scale, + v_scale, + output_dtype: tl.constexpr, E_DIM: tl.constexpr, BLOCK_SIZE: tl.constexpr, ): batch_idx = tl.program_id(0) block_idx = tl.program_id(1) - batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx + - tl.arange(0, 2)) - batch_token_start, batch_token_end = tl.split(batch_token_indexes) - seq_len = batch_token_end - batch_token_start batch_query_indexes = tl.load(b_query_lens_loc + batch_idx + tl.arange(0, 2)) batch_query_start, batch_query_end = tl.split(batch_query_indexes) query_len = batch_query_end - batch_query_start + if query_len <= 1: return + + batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx + + tl.arange(0, 2)) + batch_token_start, batch_token_end = tl.split(batch_token_indexes) + seq_len = batch_token_end - batch_token_start + if block_idx * BLOCK_SIZE < seq_len: block_mask = (block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)[:, None]) < seq_len kv_idx = tl.load(block_table + batch_idx * block_table_stride_0 + - block_idx) + block_idx).to(tl.int64) kv_buffer_off = kv_idx * BLOCK_SIZE * E_DIM + tl.arange( 0, BLOCK_SIZE)[:, None] * E_DIM + tl.arange(0, E_DIM)[None, :] k_vals = tl.load(k_buffer_ptr + kv_buffer_off, mask=block_mask, other=0.0) + if k_vals.dtype.is_fp8(): + k_vals = (k_vals.to(tl.float32) * + tl.load(k_scale)).to(output_dtype) + else: + k_vals = k_vals.to(output_dtype) + v_vals = tl.load(v_buffer_ptr + kv_buffer_off, mask=block_mask, other=0.0) - + if v_vals.dtype.is_fp8(): + v_vals = 
(v_vals.to(tl.float32) * + tl.load(v_scale)).to(output_dtype) + else: + v_vals = v_vals.to(output_dtype) kv_values_off = batch_token_start * E_DIM + \ block_idx * BLOCK_SIZE * E_DIM + \ tl.arange(0, BLOCK_SIZE)[:, None] * E_DIM + \ @@ -72,29 +89,44 @@ if current_platform.is_rocm(): tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask) def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table, - k_buffer, v_buffer, max_seq_len, total_tokens): - H_KV = v_buffer.shape[2] - D = v_buffer.shape[3] - BLOCK_SIZE = v_buffer.shape[1] - dtype = k_buffer.dtype - k_values = torch.empty((total_tokens, H_KV, D), - dtype=dtype, - device="cuda") - v_values = torch.empty((total_tokens, H_KV, D), - dtype=dtype, - device="cuda") + k_cache, v_cache, max_seq_len, k_scale, v_scale, + output_dtype, total_tokens): + H_KV = v_cache.shape[2] + D = v_cache.shape[3] + BLOCK_SIZE = v_cache.shape[1] + + k_values = torch.empty( + (total_tokens, H_KV, D), + dtype=output_dtype, + device=k_cache.device, + ) + v_values = torch.empty( + (total_tokens, H_KV, D), + dtype=output_dtype, + device=v_cache.device, + ) grid = (block_table.shape[0], (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE) - _vllm_layout_trans_kernel[grid](k_buffer, - v_buffer, + if output_dtype == torch.float16: + output_dtype = tl.float16 + elif output_dtype == torch.bfloat16: + output_dtype = tl.bfloat16 + else: + raise ValueError(f"Unsupported output dtype: {output_dtype}") + + _vllm_layout_trans_kernel[grid](k_cache, + v_cache, k_values, v_values, b_query_lens_loc, b_seq_lens_loc, block_table, block_table.stride(0), + k_scale, + v_scale, + output_dtype=output_dtype, E_DIM=H_KV * D, BLOCK_SIZE=BLOCK_SIZE) @@ -107,16 +139,22 @@ if current_platform.is_rocm(): out: torch.Tensor, cu_seqlens_q: torch.Tensor, cu_seqlens_k: torch.Tensor, - total_tokens: int, max_seqlen_q: int, max_seqlen_k: int, softmax_scale: float, window_size: Optional[list[int]], # -1 means infinite context window alibi_slopes: Optional[list[float]], 
block_table: torch.Tensor, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + total_tokens: int = 0, ) -> torch.Tensor: + if total_tokens == 0: + total_tokens = int(cu_seqlens_k[-1].item()) k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table, - k_cache, v_cache, max_seqlen_k, total_tokens) + k_cache, v_cache, max_seqlen_k, k_scale, + v_scale, q.dtype, total_tokens) + output = aiter.flash_attn_varlen_func( q=q, k=k, @@ -141,19 +179,21 @@ if current_platform.is_rocm(): out: torch.Tensor, cu_seqlens_q: torch.Tensor, cu_seqlens_k: torch.Tensor, - total_tokens: int, max_seqlen_q: int, max_seqlen_k: int, softmax_scale: float, window_size: Optional[list[int]], # -1 means infinite context window alibi_slopes: Optional[list[float]], block_table: torch.Tensor, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + total_tokens: int = 0, ) -> torch.Tensor: return torch.empty(q.shape[0], q.shape[1], v_cache.shape[-2], - dtype=torch.float8_e4m3fnuz, - device="cuda") + dtype=q.dtype, + device=q.device) direct_register_custom_op("flash_attn_varlen_func", flash_attn_varlen_func_impl, ["out"], @@ -163,7 +203,33 @@ if current_platform.is_rocm(): logger = init_logger(__name__) -class AiterFlashAttentionMetadataBuilder: +@dataclass +class AiterFlashAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + slot_mapping: torch.Tensor + block_table: torch.Tensor + + # For cascade attention. 
+ use_cascade: bool + common_prefix_len: int + total_tokens: int + + +class AiterFlashAttentionMetadataBuilder( + AttentionMetadataBuilder[AiterFlashAttentionMetadata]): + full_cudagraph_supported: ClassVar[bool] = True def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, device: torch.device): @@ -180,14 +246,23 @@ class AiterFlashAttentionMetadataBuilder: self.headdim = self.model_config.get_head_size() self.block_size = kv_cache_spec.block_size self.kv_cache_spec = kv_cache_spec - # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. self.aot_sliding_window: Optional[tuple[int, int]] = None + self.total_tokens: int = 0 def reorder_batch(self, input_batch, scheduler_output) -> bool: return False + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + self.total_tokens = self.model_config.max_model_len \ + * self.vllm_config.scheduler_config.max_num_partial_prefills + res = self.build(common_prefix_len=0, + common_attn_metadata=common_attn_metadata) + self.total_tokens = 0 + return res + def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, @@ -195,43 +270,29 @@ class AiterFlashAttentionMetadataBuilder: num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) - total_tokens = int(common_attn_metadata.seq_lens_cpu.sum()) query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping - cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(seq_lens, - dim=0, - dtype=cu_seq_lens.dtype, - out=cu_seq_lens[1:]) + def schedule(batch_size, cu_query_lens, max_query_len, seqlens, + max_seq_len, causal): + return None use_cascade = 
common_prefix_len > 0 - cu_prefix_query_lens = None - prefix_kv_lens = None - suffix_kv_lens = None - attn_metadata = AiterFlashAttentionMetadata( num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_lens=seq_lens, - cu_seq_lens=cu_seq_lens, - total_tokens=total_tokens, block_table=block_table_tensor, slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, - cu_prefix_query_lens=cu_prefix_query_lens, - prefix_kv_lens=prefix_kv_lens, - suffix_kv_lens=suffix_kv_lens, + total_tokens=self.total_tokens, ) return attn_metadata @@ -254,7 +315,7 @@ class AiterFlashAttentionBackend(AttentionBackend): @classmethod def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] + return [64, 128, 256] @classmethod def validate_head_size(cls, head_size: int) -> None: @@ -295,34 +356,6 @@ class AiterFlashAttentionBackend(AttentionBackend): return (2, num_blocks, block_size, num_kv_heads, head_size) -@dataclass -class AiterFlashAttentionMetadata: - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - num_actual_tokens: int # Number of tokens excluding padding. - max_query_len: int - query_start_loc: torch.Tensor - max_seq_len: int - seq_lens: torch.Tensor - cu_seq_lens: torch.Tensor - total_tokens: int - block_table: torch.Tensor - slot_mapping: torch.Tensor - - # For cascade attention. 
- use_cascade: bool - common_prefix_len: int - cu_prefix_query_lens: Optional[torch.Tensor] - prefix_kv_lens: Optional[torch.Tensor] - suffix_kv_lens: Optional[torch.Tensor] - - class AiterFlashAttentionImpl(AttentionImpl): def __init__( @@ -366,10 +399,6 @@ class AiterFlashAttentionImpl(AttentionImpl): "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "AiterFlashAttention does not support fp8 kv-cache on this " - "device.") def forward( self, @@ -440,12 +469,6 @@ class AiterFlashAttentionImpl(AttentionImpl): if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(torch.float8_e4m3fnuz) value_cache = value_cache.view(torch.float8_e4m3fnuz) - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) if not attn_metadata.use_cascade: cu_seqlens_q = attn_metadata.query_start_loc @@ -455,8 +478,16 @@ class AiterFlashAttentionImpl(AttentionImpl): block_table = attn_metadata.block_table if max_seqlen_q > 1: - cu_seq_lens = attn_metadata.cu_seq_lens - total_tokens = attn_metadata.total_tokens + + cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1, + dtype=torch.int32, + device=query.device) + + torch.cumsum(seqused_k, + dim=0, + dtype=cu_seq_lens.dtype, + out=cu_seq_lens[1:]) + torch.ops.vllm.flash_attn_varlen_func( query[:num_actual_tokens], key_cache, @@ -465,29 +496,31 @@ class AiterFlashAttentionImpl(AttentionImpl): cu_seqlens_q=cu_seqlens_q, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, - total_tokens=total_tokens, softmax_scale=self.scale, alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, - cu_seqlens_k=cu_seq_lens) + cu_seqlens_k=cu_seq_lens, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + 
total_tokens=attn_metadata.total_tokens, + ) _, num_heads, head_size = query.shape - _PARTITION_SIZE_ROCM = 256 + nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8 num_seqs = seqused_k.shape[0] - nbyes_per_qo_elem = torch.finfo(output.dtype).bits // 8 max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM - 1) // _PARTITION_SIZE_ROCM workspace_buffer = torch.empty( (num_seqs * num_heads * max_num_partitions * head_size) * - nbyes_per_qo_elem + 2 * + nbytes_per_qo_elem + 2 * (num_seqs * num_heads * max_num_partitions) * 4, dtype=torch.uint8, device=output.device, ) - aiter.paged_attention_v1( + torch.ops.aiter.paged_attention_v1( output[:num_actual_tokens], workspace_buffer, query[:num_actual_tokens], From 136d750f5f421ca5be2e24b0a913e813d99bb831 Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Fri, 25 Jul 2025 06:53:21 -0700 Subject: [PATCH 10/57] [Kernel] Improve machete memory bound perf (#21556) Signed-off-by: czhu-cohere --- csrc/quantization/machete/machete_prepacked_layout.cuh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh index 81aaa6c4f3a28..4a7d6341e6c00 100644 --- a/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate { CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy( Shape_NKL shape_mkl) { auto layout = TVbNbKL_to_offset(shape_mkl); - return make_layout(coalesce(get<0>(layout)), get<1>(layout), - get<2>(layout)); + // for 4-bit elements, having >= 64 values per column + // allows TMA to load full 32-byte sectors + auto inner_layout = + make_layout(make_shape(_256{}, size<0>(layout) / _256{})); + + return make_layout(inner_layout, get<1>(layout), get<2>(layout)); } // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx) From e189b50f53e333814d41278c5e5be66240c99018 Mon Sep 17 
00:00:00 2001 From: mgazz Date: Fri, 25 Jul 2025 15:01:27 +0100 Subject: [PATCH 11/57] Add support for Prithvi in Online serving mode (#21518) Signed-off-by: Michele Gazzetti Co-authored-by: Cyrus Leung --- .../entrypoints/openai/test_skip_tokenizer.py | 93 +++++++++++++++++++ vllm/engine/multiprocessing/client.py | 20 ++-- vllm/entrypoints/openai/serving_engine.py | 14 ++- vllm/entrypoints/openai/serving_pooling.py | 6 +- .../models/prithvi_geospatial_mae.py | 5 +- 5 files changed, 128 insertions(+), 10 deletions(-) create mode 100644 tests/entrypoints/openai/test_skip_tokenizer.py diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py new file mode 100644 index 0000000000000..32d28277e0ef8 --- /dev/null +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import io + +import numpy as np +import pytest +import requests +import torch + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM" +DTYPE = "float16" + + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embed", + # use half precision for speed and memory savings in CI environment + "--dtype", + DTYPE, + "--enforce-eager", + "--trust-remote-code", + "--skip-tokenizer-init", + "--max-num-seqs", + "32" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_request(server: RemoteOpenAIServer, model_name: str): + + pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16) + 
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16) + + buffer_tiff = io.BytesIO() + torch.save(pixel_values, buffer_tiff) + buffer_tiff.seek(0) + binary_data = buffer_tiff.read() + base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8') + + buffer_coord = io.BytesIO() + torch.save(location_coords, buffer_coord) + buffer_coord.seek(0) + binary_data = buffer_coord.read() + base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8') + + prompt = { + "model": + model_name, + "additional_data": { + "prompt_token_ids": [1] + }, + "encoding_format": + "base64", + "messages": [{ + "role": + "user", + "content": [{ + "type": "image_embeds", + "image_embeds": { + "pixel_values": base64_tensor_embedding, + "location_coords": base64_coord_embedding, + }, + }], + }] + } + + # test single pooling + response = requests.post(server.url_for("pooling"), json=prompt) + response.raise_for_status() + + output = response.json()["data"][0]['data'] + + np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32) + + assert len(np_response) == 524288 diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 67d9a3bf6ce20..cde8fc367fb54 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient): self.model_config = engine_config.model_config self.decoding_config = engine_config.decoding_config - # Create the tokenizer group. - self.tokenizer = init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=engine_config.scheduler_config, - lora_config=engine_config.lora_config) + if self.vllm_config.model_config.skip_tokenizer_init: + self.tokenizer = None + + else: + # Create the tokenizer group. 
+ self.tokenizer = init_tokenizer_from_configs( + model_config=self.model_config, + scheduler_config=engine_config.scheduler_config, + lora_config=engine_config.lora_config) + self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer) @@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient): return self.input_preprocessor async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): - return await self.tokenizer.get_lora_tokenizer_async(lora_request) + if self.tokenizer is None: + return None + else: + return await self.tokenizer.get_lora_tokenizer_async(lora_request) async def get_vllm_config(self) -> VllmConfig: return self.vllm_config diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index edc366f9b8a88..9d848679d5d98 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -880,7 +880,10 @@ class OpenAIServing: _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: Union[str, list[int]] - if isinstance(tokenizer, MistralTokenizer): + + if tokenizer is None: + request_prompt = "placeholder" + elif isinstance(tokenizer, MistralTokenizer): request_prompt = apply_mistral_chat_template( tokenizer, messages=messages, @@ -910,7 +913,14 @@ class OpenAIServing: request = tool_parser(tokenizer).adjust_request( # type: ignore request=request) - if isinstance(request_prompt, str): + if tokenizer is None: + assert isinstance(request_prompt, str), ( + "Prompt has to be a string", \ + "when the tokenizer is not initialised" + ) + prompt_inputs = TextTokensPrompt(prompt=request_prompt, + prompt_token_ids=[1]) + elif isinstance(request_prompt, str): prompt_inputs = await self._tokenize_prompt_input_async( request, tokenizer, diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 12334cdac365a..38745d001ade6 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ 
b/vllm/entrypoints/openai/serving_pooling.py @@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing): self.max_model_len, truncate_prompt_tokens) lora_request = self._maybe_get_adapters(request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + if self.model_config.skip_tokenizer_init: + tokenizer = None + else: + tokenizer = await self.engine_client.get_tokenizer(lora_request + ) if isinstance(request, PoolingChatRequest): ( diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 0f00fd47fe4fc..304a9e987ee03 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -103,7 +103,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): mm_kwargs = {} for k, v in mm_data.items(): - mm_kwargs[k] = v + if isinstance(v, dict) and k == "image": + mm_kwargs.update(v) + else: + mm_kwargs[k] = v mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} # This model receives in input a multi-dimensional tensor representing From 396ee941803d0382603f567bba41116bb3d04dda Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 25 Jul 2025 22:33:56 +0800 Subject: [PATCH 12/57] [CI] Unifying Dockerfiles for ARM and X86 Builds (#21343) Signed-off-by: Kebe --- .github/workflows/lint-and-deploy.yaml | 2 +- docker/Dockerfile.arm | 62 ------------------- docker/Dockerfile.cpu | 24 ++++++- .../installation/cpu/arm.inc.md | 2 +- requirements/cpu.txt | 6 +- 5 files changed, 29 insertions(+), 67 deletions(-) delete mode 100644 docker/Dockerfile.arm diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 74a7a3a3530f5..d5736c0aee208 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -7,7 +7,7 @@ permissions: jobs: lint-and-deploy: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - name: Checkout uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/docker/Dockerfile.arm b/docker/Dockerfile.arm deleted file mode 100644 index bad093684239c..0000000000000 --- a/docker/Dockerfile.arm +++ /dev/null @@ -1,62 +0,0 @@ -# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. - -FROM ubuntu:22.04 AS cpu-test-arm - -ENV CCACHE_DIR=/root/.cache/ccache - -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache - -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores - -# Set LD_PRELOAD for tcmalloc on ARM -ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" - -RUN echo 'ulimit -c 0' >> ~/.bashrc - -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-arm AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . 
-ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -# Disabling AVX512 specific optimizations for ARM -ARG VLLM_CPU_DISABLE_AVX512="true" -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 982c1ddf27438..5e49e87131ece 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -1,4 +1,11 @@ -# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. +# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms. +# +# Supported platforms: +# - linux/amd64 (x86_64) +# - linux/arm64 (aarch64) +# +# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.: +# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu . 
# # Build targets: # vllm-openai (default): used for serving deployment @@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --upgrade pip && \ uv pip install -r requirements/cpu.txt -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" +ARG TARGETARCH +ENV TARGETARCH=${TARGETARCH} + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \ + else \ + PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \ + fi && \ + echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc + +# Ensure that the LD_PRELOAD environment variable for export is in effect. +SHELL ["/bin/bash", "-c"] + +ENV LD_PRELOAD=${LD_PRELOAD} RUN echo 'ulimit -c 0' >> ~/.bashrc diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index 63ae351b395fb..cac578eefb1d7 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] ```bash -docker build -f docker/Dockerfile.arm \ +docker build -f docker/Dockerfile.cpu \ --tag vllm-cpu-env . 
# Launching OpenAI server diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d80354342bc20..6860275acab6f 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 torch==2.7.0; platform_system == "Darwin" -torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.7.0; platform_machine == "ppc64le" +torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" @@ -25,3 +26,6 @@ datasets # for benchmark scripts intel-openmp==2024.2.1; platform_machine == "x86_64" intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. 
+ +# Use this to gather CPU info and optimize based on ARM Neoverse cores +py-cpuinfo; platform_machine == "aarch64" From 5ac3168ee342f4cae17b0b67375e647bd5dd9151 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 25 Jul 2025 23:52:42 +0800 Subject: [PATCH 13/57] [Docs] add auto-round quantization readme (#21600) Signed-off-by: Wenhua Cheng Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/quantization/README.md | 1 + docs/features/quantization/auto_round.md | 103 +++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 docs/features/quantization/auto_round.md diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index e8c3b11230786..e18c128f30fc9 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -6,6 +6,7 @@ Contents: - [Supported Hardware](supported_hardware.md) - [AutoAWQ](auto_awq.md) +- [AutoRound](auto_round.md) - [BitsAndBytes](bnb.md) - [BitBLAS](bitblas.md) - [GGUF](gguf.md) diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md new file mode 100644 index 0000000000000..2dfd847bb7d9a --- /dev/null +++ b/docs/features/quantization/auto_round.md @@ -0,0 +1,103 @@ +# AutoRound + +[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8** +quantized large language models—striking an optimal balance between accuracy and deployment performance. + +AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster +inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs, +Intel GPUs, HPUs, and CUDA-enabled devices**. 
+ +Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details. + +Key Features: + +✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported + +✅ **10+ vision-language models (VLMs)** are supported + +✅ **Per-layer mixed-bit quantization** for fine-grained control + +✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss + +✅ **Multiple quantization recipes**: best, base, and light + +✅ Advanced utilities such as immediate packing and support for **10+ backends** + +## Installation + +```bash +uv pip install auto-round +``` + +## Quantizing a model + +For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage. + +### CLI usage + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --bits 4 \ + --group_size 128 \ + --format "auto_round" \ + --output_dir ./tmp_autoround +``` + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --format "gguf:q4_k_m" \ + --output_dir ./tmp_autoround +``` + +### API usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +from auto_round import AutoRound + +model_name = "Qwen/Qwen3-0.6B" +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_name) + +bits, group_size, sym = 4, 128, True +autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) + +# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower +# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym) + +# 2-3X speedup, slight accuracy drop at W4G128 +# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym ) + +output_dir = "./tmp_autoround" +# format= 'auto_round'(default), 'auto_gptq', 'auto_awq' +autoround.quantize_and_save(output_dir, format="auto_round") +``` + +## 
Running a quantized model with vLLM + +Here is some example code to run auto-round format in vLLM: + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95) +model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound" +llm = LLM(model=model_name) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +# Acknowledgement + +Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and +ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound. From 7cfea0df390c154c1026f77d3682e2733ca4aca8 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 25 Jul 2025 13:22:01 -0700 Subject: [PATCH 14/57] [TPU][Test] Rollback PR-21550. (#21619) Signed-off-by: Qiliang Cui --- tests/v1/tpu/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index dd89059ded524..865b58bc7f4b0 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -59,7 +59,7 @@ def test_basic( # actually test chunked prompt max_num_batched_tokens=1024, max_model_len=8192, - gpu_memory_utilization=0.95, + gpu_memory_utilization=0.7, max_num_seqs=max_num_seqs, tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, From 41d3082c416897092bc924bc341e86b3e49728ee Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 25 Jul 2025 17:06:48 -0700 Subject: [PATCH 15/57] Add Unsloth to RLHF.md (#21636) --- docs/training/rlhf.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md index 4f75e4e01495c..f608a630ab7a5 100644 --- a/docs/training/rlhf.md +++ 
b/docs/training/rlhf.md @@ -2,10 +2,14 @@ Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. -vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). +vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth). See the following basic examples to get started if you don't want to use an existing library: - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) + +See the following notebooks showing how to use vLLM for GRPO: + +- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb) From 75d29cf4e1d7e950c2308b12e944b507fb3e1916 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 25 Jul 2025 20:07:07 -0400 Subject: [PATCH 16/57] [Perf] Cuda Kernel for Int8 Per Token Group Quant (#21476) Signed-off-by: yewentao256 --- csrc/ops.h | 5 +++++ .../compressed_tensors/int8_quant_kernels.cu | 10 ++++++++++ csrc/quantization/fp8/per_token_group_quant.cu | 6 +++++- csrc/quantization/per_token_group_quant_8bit.h | 10 ++++++++++ csrc/torch_bindings.cpp | 8 ++++++++ .../layers/quantization/utils/int8_utils.py | 11 +++++++++-- 6 files 
changed, 47 insertions(+), 3 deletions(-) create mode 100644 csrc/quantization/per_token_group_quant_8bit.h diff --git a/csrc/ops.h b/csrc/ops.h index 97a247d9d628c..207291eceb169 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double fp8_min, double fp8_max, bool scale_ue8m0); + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 5cd2ac179768b..6a81f159f46ae 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include "../../dispatch_utils.h" @@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant( } }); } + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max) { + per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, + int8_min, int8_max); +} \ No newline at end of file diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index afc41faeca902..2609054f2072b 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include @@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, 
int64_t group_size, double eps, double min_8bit, double max_8bit, - bool scale_ue8m0 = false) { + bool scale_ue8m0) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(output_q.is_contiguous()); @@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input, input.scalar_type(), "per_token_group_quant_8bit", ([&] { if (dst_type == at::ScalarType::Float8_e4m3fn) { LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + } else if (dst_type == at::ScalarType::Char) { + LAUNCH_KERNEL(scalar_t, int8_t); } })); diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/per_token_group_quant_8bit.h new file mode 100644 index 0000000000000..537b61bc4303f --- /dev/null +++ b/csrc/quantization/per_token_group_quant_8bit.h @@ -0,0 +1,10 @@ +#pragma once +#include + +// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders +// 8-bit per-token-group quantization helper used by both FP8 and INT8 +void per_token_group_quant_8bit(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double min_8bit, double max_8bit, + bool scale_ue8m0 = false); \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 95f8541bc9e2d..85b6abef00b03 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("per_token_group_fp8_quant", torch::kCUDA, &per_token_group_quant_fp8); + // Compute per-token-group INT8 quantized tensor and scaling factor. + ops.def( + "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! 
" + "output_s, int group_size, float eps, float int8_min, float int8_max) -> " + "()"); + ops.impl("per_token_group_quant_int8", torch::kCUDA, + &per_token_group_quant_int8); + // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, " diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 1fdf7d174e25e..6840cabbf1ae3 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -238,13 +238,20 @@ def per_token_group_quant_int8( int8_min = iinfo.min x_q = torch.empty_like(x, device=x.device, dtype=dtype) - M = x.numel() // group_size - N = group_size x_s = torch.empty( x.shape[:-1] + (x.shape[-1] // group_size, ), device=x.device, dtype=torch.float32, ) + # prefer CUDA kernel if available + if current_platform.is_cuda(): + torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps, + float(int8_min), + float(int8_max)) + return x_q, x_s + + M = x.numel() // group_size + N = group_size BLOCK = triton.next_power_of_2(N) # heuristics for number of warps From 2eddd437ba5e7ce80d7341bf87a3078802b01ba7 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:07:26 -0700 Subject: [PATCH 17/57] Add interleaved RoPE test for Llama4 (Maverick) (#21478) Signed-off-by: Yong Hoon Shin --- .../multimodal/generation/test_maverick.py | 92 +++++++++++++++---- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py index 306cf39002df2..bacc9ef94f49d 100644 --- a/tests/models/multimodal/generation/test_maverick.py +++ b/tests/models/multimodal/generation/test_maverick.py @@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, 
GenerationConfig) from vllm import LLM, SamplingParams +from vllm.v1.executor.abstract import Executor +from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, + FullAttentionSpec) from ....utils import multi_gpu_test @@ -69,6 +72,26 @@ def run_maverick_serving(model: str): raise +def get_rope_layers_config(model_path: str) -> list[int]: + """ + Get the interleaved RoPE configuration from HuggingFace config + + Args: + model_path: Path to the local directory containing the reduced + Maverick model checkpoint + + Returns: + List of 0 or 1 indicating whether each layer uses RoPE and local attn + 0 indicates that RoPE is not used while 1 indicates that RoPE is used. + """ + config_path = Path(model_path) / "config.json" + model_config = json.loads(config_path.read_text()) + text_config = model_config["text_config"] + no_rope_layers = text_config["no_rope_layers"] + print(f"Found no_rope_layers: {no_rope_layers}") + return no_rope_layers + + def create_reduced_maverick_model( original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -113,7 +136,6 @@ def create_reduced_maverick_model( print("Loading original model configuration...") original_config = AutoConfig.from_pretrained(original_model_name, trust_remote_code=True) - print("Creating reduced configuration...") reduced_config = create_reduced_config(original_config, text_layers, num_experts, vision_layers) @@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor], f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB") -def run_reduced_model(model_path: str, - should_profile: bool = False, - **kwargs) -> None: - """Test the created reduced model with vLLM.""" - - print(f"\nTesting reduced model at {model_path}...") - - llm = LLM( - model=model_path, - trust_remote_code=True, - max_model_len=512, # Small context for testing - gpu_memory_utilization=0.3, # Conservative memory usage - **kwargs, +def check_attention_spec_interleaved_rope( + llm: LLM, 
+ num_attention_layers: int, + num_ranks: int, + rope_layers: list[int], +): + """Check that the attention spec is correct.""" + assert isinstance(llm.llm_engine.model_executor, Executor) + kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs( ) + for rank in range(num_ranks): + kv_cache_specs = kv_cache_specs_per_rank[rank] + assert len(kv_cache_specs.keys()) == num_attention_layers + for i in range(num_attention_layers): + if rope_layers[i] == 0: + expected_spec = FullAttentionSpec + else: + expected_spec = ChunkedLocalAttentionSpec + assert isinstance( + kv_cache_specs[ + f"language_model.model.layers.{i}.self_attn.attn"], + expected_spec) + +def run_reduced_model(llm: LLM, should_profile: bool = False) -> None: + """Test the created reduced model with vLLM.""" sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50) @@ -551,6 +584,7 @@ def run_reduced_model(model_path: str, @pytest.mark.parametrize("tp,ep", [(2, True)]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_dummy_maverick( + monkeypatch, original_model_name: str, text_layers: int, num_experts: int, @@ -562,6 +596,10 @@ def test_dummy_maverick( force_recreate: bool = True, profile: bool = False, ) -> None: + # Disable multiprocessing allows us to access model executor from LLM engine + monkeypatch.setenv("VLLM_USE_V1", "1") + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + model_path = create_reduced_maverick_model( original_model_name=original_model_name, output_dir=output_dir, @@ -573,11 +611,27 @@ def test_dummy_maverick( print(f"\nReduced model created successfully at: {model_path}") - run_reduced_model(model_path=model_path, - should_profile=profile, - enforce_eager=enforce_eager, - tensor_parallel_size=tp, - enable_expert_parallel=ep) + rope_layers = get_rope_layers_config(model_path) + + llm = LLM( + model=model_path, + trust_remote_code=True, + max_model_len=512, # Small context for testing + 
gpu_memory_utilization=0.3, # Conservative memory usage + enforce_eager=enforce_eager, + tensor_parallel_size=tp, + enable_expert_parallel=ep, + ) + + check_attention_spec_interleaved_rope( + llm, + text_layers, + tp, + rope_layers, + ) + + print(f"\nTesting reduced model at {model_path}...") + run_reduced_model(llm=llm, should_profile=profile) def main(): From cea96a015678c86789fa86a719ce7d6d176d78fd Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:07:58 -0700 Subject: [PATCH 18/57] [Bugfix] Fix sync_and_slice_intermediate_tensors (#21537) Signed-off-by: Rui Qiao --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5fe594db667a5..6ddb2c422dff7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1270,7 +1270,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if sync_self: assert intermediate_tensors is not None for k, v in intermediate_tensors.items(): - is_scattered = "residual" and is_residual_scattered + is_scattered = k == "residual" and is_residual_scattered copy_len = num_tokens // tp if is_scattered else \ num_tokens self.intermediate_tensors[k][:copy_len].copy_( From c7742d61134783b50098ab249f6815051a4c4a2a Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:08:30 -0700 Subject: [PATCH 19/57] [Bugfix] Always set RAY_ADDRESS for Ray actor before spawn (#21540) Signed-off-by: Rui Qiao --- vllm/utils/__init__.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9f4140ac64e2f..054037b8932b7 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2883,26 +2883,27 @@ def _maybe_force_spawn(): if 
os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn": return - reason = None - if cuda_is_initialized(): - reason = "CUDA is initialized" - elif xpu_is_initialized(): - reason = "XPU is initialized" - elif is_in_ray_actor(): + reasons = [] + if is_in_ray_actor(): # even if we choose to spawn, we need to pass the ray address # to the subprocess so that it knows how to connect to the ray cluster. # env vars are inherited by subprocesses, even if we use spawn. import ray os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address - reason = "In a Ray actor and can only be spawned" + reasons.append("In a Ray actor and can only be spawned") - if reason is not None: + if cuda_is_initialized(): + reasons.append("CUDA is initialized") + elif xpu_is_initialized(): + reasons.append("XPU is initialized") + + if reasons: logger.warning( "We must use the `spawn` multiprocessing start method. " "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " "See https://docs.vllm.ai/en/latest/usage/" "troubleshooting.html#python-multiprocessing " - "for more information. Reason: %s", reason) + "for more information. 
Reasons: %s", "; ".join(reasons)) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From f1b286b2fbbde18745d57b0ce7ac4fbc56f10f0d Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Fri, 25 Jul 2025 17:09:00 -0700 Subject: [PATCH 20/57] [TPU] Update ptxla nightly version to 20250724 (#21555) Signed-off-by: Chengji Yao --- docker/Dockerfile.tpu | 2 +- requirements/tpu.txt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index 3474ff50de7bd..b9fc9def88190 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250714" +ARG NIGHTLY_DATE="20250724" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/requirements/tpu.txt b/requirements/tpu.txt index d86f643d388ba..2d0d8bd8457e3 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -19,8 +19,8 @@ nixl==0.3.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.9.0.dev20250716 -torchvision==0.24.0.dev20250716 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" +torch==2.9.0.dev20250724 +torchvision==0.24.0.dev20250724 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" From 7ae75fa6d02afc45637b060ce7a535a1bd547afd Mon Sep 17 00:00:00 2001 From: Alex Kogan <82225080+sakogan@users.noreply.github.com> Date: Fri, 25 Jul 2025 21:09:34 -0400 Subject: [PATCH 21/57] [Feature] Add support for MoE models in the calibration-free RTN-based quantization (#20766) Signed-off-by: Alex Kogan --- tests/quantization/test_rtn.py | 5 +- .../model_executor/layers/quantization/rtn.py | 234 +++++++++++++++--- 2 files changed, 201 insertions(+), 38 deletions(-) diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py index 133b2d9e4df69..bc2b468f97d8c 100644 --- a/tests/quantization/test_rtn.py +++ b/tests/quantization/test_rtn.py @@ -8,7 +8,10 @@ import pytest from tests.quantization.utils import is_quant_method_supported -MODELS = ["microsoft/Phi-3-mini-4k-instruct"] +MODELS = [ + "microsoft/Phi-3-mini-4k-instruct", # dense model + "ai21labs/Jamba-tiny-dev", # MoE model +] @pytest.mark.skipif(not is_quant_method_supported("rtn"), diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 68309716cf901..cceaf9857c40f 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -3,18 +3,19 @@ # Copyright © 2025, Oracle and/or its affiliates. 
import os -from typing import Any, Optional +from typing import Any, Callable, Optional import torch import torch.nn.functional as F from torch.nn.parameter import Parameter from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) logger = init_logger(__name__) """By default, use 8 bit as target precision, but it can be @@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig): return cls(weight_bits, group_size) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["RTNLinearMethod"]: + prefix: str) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): return RTNLinearMethod(self) + elif isinstance(layer, FusedMoE): + return RTNMoEMethod(self) return None @@ -94,11 +97,18 @@ class RTNTensor: self.data.narrow(dim, start // factor, length // factor), self.scale.narrow(dim, start, length), self.quant_config) + def __getitem__(self, key): + return RTNTensor(self.data[key], self.scale[key], self.quant_config) + @property def shape(self): shape = self.data.shape factor = 1 if self.quant_config.weight_bits == 8 else 2 - return torch.Size((shape[0] * factor, shape[1])) + batch_present = len(shape) == 3 + if batch_present: + return torch.Size((shape[0], shape[1] * factor, shape[2])) + else: + return torch.Size((shape[0] * factor, shape[1])) def copy_(self, loaded_weight: torch.Tensor) -> None: qweight, weight_scale = rtn_quantize(loaded_weight.cuda(), @@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase): weight = RTNParameter(data=torch.empty(output_size_per_partition // factor, input_size_per_partition, - dtype=torch.int8), + dtype=torch.uint8), scale=scale, 
quant_config=self.quant_config) @@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase): layer.output_size_per_partition = output_size_per_partition def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - """torch.compile does not know how to deal with a Parameter subclass - (aka RTNParameter). As we don't really need RTNParameters for the - forward pass, we replace them with equivalent instances of Parameters. - """ - old_weight = layer.weight - assert isinstance(old_weight, RTNParameter) - data = old_weight.data.data - - delattr(layer, "weight") - - new_weight = Parameter(data=data, requires_grad=False) - layer.register_parameter("weight", new_weight) + fix_weights(layer, "weight") def apply(self, layer: torch.nn.Module, @@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase): return out +class RTNMoEMethod(FusedMoEMethodBase): + + def __init__(self, quant_config: RTNConfig): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + factor = 1 if self.quant_config.weight_bits == 8 else 2 + + # Fused gate_up_proj (column parallel) + num_groups_per_col = (hidden_size // self.quant_config.group_size + if self.quant_config.group_size != -1 else 1) + w13_scale = Parameter( + torch.empty(num_experts, + 2 * intermediate_size_per_partition, + num_groups_per_col, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_scale", w13_scale) + + w13_weight = RTNParameter(data=torch.empty( + num_experts, + 2 * intermediate_size_per_partition // factor, + hidden_size, + dtype=torch.uint8), + scale=w13_scale, + quant_config=self.quant_config) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + num_groups_per_col = (intermediate_size_per_partition // + self.quant_config.group_size + 
if self.quant_config.group_size != -1 else 1) + w2_scale = Parameter(torch.zeros(num_experts, + hidden_size, + num_groups_per_col, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_scale", w2_scale) + + w2_weight = RTNParameter(data=torch.empty( + num_experts, + hidden_size // factor, + intermediate_size_per_partition, + dtype=torch.uint8), + scale=w2_scale, + quant_config=self.quant_config) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight_bits = self.quant_config.weight_bits + fix_weights(layer, "w13_weight", weight_bits == 4) + fix_weights(layer, "w2_weight", weight_bits == 4) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `RTNMoEMethod` yet.") + + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + 
e_score_correction_bias=e_score_correction_bias) + + weight_bits = self.quant_config.weight_bits + group_size = self.quant_config.group_size + + ret = fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + global_num_experts=global_num_experts, + w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + block_shape=[0, group_size]) + + return ret + + def rtn_quantize(tensor: torch.Tensor, num_bits: int, group_size: int) -> tuple[torch.Tensor, torch.Tensor]: """Quantize a tensor using per-group static scaling factor. @@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int, If equal to -1, each row in the input tensor is treated as one group. """ + batch_present = len(tensor.shape) == 3 + if not batch_present: + tensor = tensor.unsqueeze(0) q_range = 2**num_bits - num_groups = (tensor.shape[0] * tensor.shape[1] // - group_size if group_size != -1 else tensor.shape[0]) + num_groups = (tensor.shape[1] * tensor.shape[2] // + group_size if group_size != -1 else tensor.shape[1]) """Calculate a scaling factor per input group. """ - input_flat = tensor.reshape(num_groups, -1) - input_min = torch.min(input_flat, dim=1, keepdim=True)[0] - input_max = torch.max(input_flat, dim=1, keepdim=True)[0] + input_flat = tensor.reshape(tensor.shape[0], num_groups, -1) + input_min = torch.min(input_flat, dim=2, keepdim=True)[0] + input_max = torch.max(input_flat, dim=2, keepdim=True)[0] input_max_abs = torch.max(input_min.abs(), input_max.abs()) scale = (input_max_abs * 2.0 / (q_range - 1)) - """Scale each input group, truncate and round to the nearest integer. + """Scale each input group, round to the nearest integer, shift + the range and truncate. 
""" scaled_input = input_flat / scale - scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1) scaled_input = scaled_input.round() + scaled_input += q_range // 2 + scaled_input = scaled_input.clamp(0, q_range - 1) - scale = scale.reshape(tensor.shape[0], -1).contiguous() - inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8) + scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous() + inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8) inputs_q = inputs_q.contiguous() if num_bits == 4: """Pack two 4-bit values into each byte. """ - inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf) - inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1]) + inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf) + inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2, + tensor.shape[2]) inputs_q = inputs_q.contiguous() + if not batch_present: + inputs_q = inputs_q.squeeze(0) + scale = scale.squeeze(0) + return inputs_q, scale @@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: tensor: The input tensor. scale: The tensor with per-group scale factors. """ + batch_present = len(tensor.shape) == 3 + if not batch_present: + tensor = tensor.unsqueeze(0) + scale = scale.unsqueeze(0) - num_groups = scale.size(0) * scale.size(1) - input_dim, output_dim = tensor.shape + num_groups = scale.size(1) * scale.size(2) + batch, input_dim, output_dim = tensor.shape - num_bits = 8 if input_dim == scale.size(0) else 4 + num_bits = 8 if input_dim == scale.size(1) else 4 + q_range = 2**num_bits if num_bits == 4: input_dim *= 2 - data = torch.empty((input_dim, output_dim), + data = torch.empty((batch, input_dim, output_dim), dtype=scale.dtype, device=tensor.device) if num_bits == 8: data.copy_(tensor) + data -= q_range // 2 else: """Unpack two 4-bit values from each byte. 
""" - tensor = tensor.reshape(input_dim, output_dim // 2) + tensor = tensor.reshape(batch, input_dim, output_dim // 2) for i in range(2): - data[:, i::2] = (tensor << 4 * (1 - i)) >> 4 + data[:, :, i::2] = ((tensor << 4 * + (1 - i)) >> 4).to(torch.int8) - q_range // 2 """Scale each input group with its scaling factor. """ - scale = scale.reshape(num_groups, -1) - data = data.reshape(num_groups, -1) + scale = scale.reshape(batch, num_groups, -1) + data = data.reshape(batch, num_groups, -1) data = torch.mul(data, scale) - input_deq = data.reshape((input_dim, output_dim)).contiguous() + input_deq = data.reshape((batch, input_dim, output_dim)).contiguous() + if not batch_present: + input_deq = input_deq.squeeze(0) + return input_deq + + +def fix_weights(layer: torch.nn.Module, + param_name: str, + reshape: bool = False): + """torch.compile does not know how to deal with a Parameter subclass + (aka RTNParameter). As we don't really need RTNParameters for the + forward pass, we replace them with equivalent instances of Parameters. 
+ """ + old_weight = getattr(layer, param_name) + assert isinstance(old_weight, RTNParameter) + data = old_weight.data.data + + delattr(layer, param_name) + + if reshape: + data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1) + new_weight = Parameter(data=data, requires_grad=False) + layer.register_parameter(param_name, new_weight) From 62965de5fe8be8e3622952a9b5cda86973cf9c51 Mon Sep 17 00:00:00 2001 From: Farzad Abdolhosseini Date: Sat, 26 Jul 2025 04:12:31 +0300 Subject: [PATCH 22/57] [Model] Ultravox: Support Llama 4 and Gemma 3 backends (#17818) Signed-off-by: Farzad Abdolhosseini Signed-off-by: Patrick Li Co-authored-by: Patrick Li --- tests/models/registry.py | 2 ++ vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/ultravox.py | 38 +++++++++++++-------- vllm/transformers_utils/configs/ultravox.py | 22 +++++++----- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 1800262ced67f..b41e432d738a7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -221,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501 "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", is_available_online=False), + "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 + is_available_online=False), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 14a8ac7876f73..9b204fdcbe1a5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -89,6 +89,7 @@ _TEXT_GENERATION_MODELS = { "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), 
"JambaForCausalLM": ("jamba", "JambaForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), + "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"), # noqa: E501 # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 3697e3fd0cf43..a4569ccd5a845 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -39,9 +39,7 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, merge_multimodal_embeddings, merge_multimodal_embeddings_from_map) -_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>" -_AUDIO_PLACEHOLDER_TOKEN = 128002 -_AUDIO_TOKENS_PER_SECOND = 6.25 +_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 @@ -80,14 +78,15 @@ class UltravoxProcessingInfo(BaseProcessingInfo): sampling_rate: Optional[int] = None, **kwargs: object, ) -> ProcessorMixin: + config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) # NOTE: Ultravox processing definition uses '<|eot_id|>' as the # placeholder that will cause confusion with the actual end of turn - # token, thus we override placeholder with a reserved special - # token. + # token, thus we override placeholder with a reserved token. 
hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE - hf_processor.audio_replacement_token_id = _AUDIO_PLACEHOLDER_TOKEN + hf_processor.audio_replacement_token_id = config.audio_token_index + return hf_processor def get_feature_extractor( @@ -274,7 +273,7 @@ class UltravoxProjector(nn.Module): else: self.act = get_act_fn(config.projector_act) - dim_out = config.text_config.hidden_size + dim_out = config.text_hidden_size self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False) # Ultravox v0.4.1 and below use layer_norm after the second linear layer @@ -572,9 +571,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: + # The audio token index is not included in the embedding table + # We need to remove it before embedding lookup + safe_input_ids = input_ids.clone() + safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0 + inputs_embeds = self.language_model.get_input_embeddings( + safe_input_ids) + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: # TODO(ywang96): remove this block after v0 is deprecated. 
if not envs.VLLM_USE_V1: @@ -585,7 +589,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): else: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - _AUDIO_PLACEHOLDER_TOKEN) + self.config.audio_token_index) return inputs_embeds def forward(self, @@ -623,10 +627,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): multimodal_embeddings) input_ids = None - hidden_states = self.language_model.model(input_ids, - positions, - intermediate_tensors, - inputs_embeds=inputs_embeds) + language_model = self.language_model + if hasattr(language_model, "language_model"): + language_model = language_model.language_model + + hidden_states = language_model.model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 62f63b02d49a4..87064cc12deda 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -45,6 +45,7 @@ class UltravoxConfig(transformers.PretrainedConfig): """ model_type = "ultravox" + audio_token = "<|audio|>" is_composition = False def __init__( @@ -80,29 +81,32 @@ class UltravoxConfig(transformers.PretrainedConfig): # Avoid circular import from vllm.transformers_utils.config import get_config - self.text_config = get_config(text_model_id, - trust_remote_code=False) + text_config_obj = get_config(text_model_id, + trust_remote_code=False) else: text_config = text_config or {} - self.text_config = transformers.CONFIG_MAPPING[text_config.get( + text_config_obj = transformers.CONFIG_MAPPING[text_config.get( "model_type", "llama")](**text_config) + inner_text_config = text_config_obj.get_text_config() + if audio_model_id is not None: # Avoid circular import from vllm.transformers_utils.config import get_config 
- self.audio_config = get_config(audio_model_id, - trust_remote_code=False) + audio_config = get_config(audio_model_id, trust_remote_code=False) else: audio_config = audio_config or {} - self.audio_config = transformers.CONFIG_MAPPING[audio_config.get( + audio_config = transformers.CONFIG_MAPPING[audio_config.get( "model_type", "whisper")](**audio_config) + self.text_config = text_config_obj + self.audio_config = audio_config self.text_model_lora_config = text_model_lora_config or {} self.audio_model_lora_config = audio_model_lora_config or {} - self.vocab_size = self.text_config.vocab_size - - self.initializer_range = self.text_config.initializer_range + self.vocab_size = inner_text_config.vocab_size + self.initializer_range = inner_text_config.initializer_range + self.text_hidden_size = inner_text_config.hidden_size super().__init__(**kwargs) From 97349fe2bc68de69550787135c1a8c6b85fc8d81 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Sat, 26 Jul 2025 09:37:32 +0800 Subject: [PATCH 23/57] [Docs] add offline serving multi-modal video input expamle Qwen2.5-VL (#21530) Signed-off-by: David Chen <530634352@qq.com> --- docs/features/multimodal_inputs.md | 64 ++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e820ace4f8fe7..e83dfdb11dadc 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. +Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown in this example using Qwen2.5-VL: + +??? 
code + + ```python + from transformers import AutoProcessor + from vllm import LLM, SamplingParams + from qwen_vl_utils import process_vision_info + + model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + video_path = "https://content.pexels.com/videos/free-videos.mp4" + + llm = LLM( + model=model_path, + gpu_memory_utilization=0.8, + enforce_eager=True, + limit_mm_per_prompt={"video": 1}, + ) + + sampling_params = SamplingParams( + max_tokens=1024, + ) + + video_messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "describe this video."}, + { + "type": "video", + "video": video_path, + "total_pixels": 20480 * 28 * 28, + "min_pixels": 16 * 28 * 28 + } + ] + }, + ] + + messages = video_messages + processor = AutoProcessor.from_pretrained(model_path) + prompt = processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + image_inputs, video_inputs = process_vision_info(messages) + mm_data = {} + if video_inputs is not None: + mm_data["video"] = video_inputs + + llm_inputs = { + "prompt": prompt, + "multi_modal_data": mm_data, + } + + outputs = llm.generate([llm_inputs], sampling_params=sampling_params) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` + + !!! note + 'process_vision_info' is only applicable to Qwen2.5-VL and similar models. 
+ Full example: ### Audio Inputs From a55c95096b3537edfbbb7a5eafae0b0475c5ef07 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 25 Jul 2025 19:06:21 -0700 Subject: [PATCH 24/57] Correctly kill vLLM processes after finishing serving benchmarks (#21641) Signed-off-by: Huy Do --- .../scripts/run-nightly-benchmarks.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc47..4162905bb3cc3 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -95,12 +95,14 @@ json2args() { } kill_gpu_processes() { - pkill -f python - pkill -f python3 - pkill -f tritonserver - pkill -f pt_main_thread - pkill -f text-generation - pkill -f lmdeploy + pkill -f '[p]ython' + pkill -f '[p]ython3' + pkill -f '[t]ritonserver' + pkill -f '[p]t_main_thread' + pkill -f '[t]ext-generation' + pkill -f '[l]mdeploy' + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pkill -f '[V]LLM' while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 From 2f6e6b33fb0acf5331982c8e1ea620005e3a19ba Mon Sep 17 00:00:00 2001 From: Alexandre JUAN Date: Sat, 26 Jul 2025 05:11:10 +0200 Subject: [PATCH 25/57] [Bugfix] Fix isinstance check for tensor types in _load_prompt_embeds to use dtype comparison (#21612) Signed-off-by: Alexandre Juan --- vllm/entrypoints/openai/serving_engine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9d848679d5d98..71976fea1ee77 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -957,9 +957,11 @@ class OpenAIServing: def 
_load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor = torch.load(io.BytesIO(base64.b64decode(embed)), weights_only=True) - assert isinstance( - tensor, - (torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor)) + assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( + torch.float32, + torch.bfloat16, + torch.float16, + ) if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 7728dd77bb802e1876012eb264df4d2fa2fc6f3c Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 25 Jul 2025 23:20:30 -0700 Subject: [PATCH 26/57] [TPU][Test] Divide TPU v1 Test into 2 parts. (#21431) --- .../hardware_ci/run-tpu-v1-test-part2.sh | 166 ++++++++++++++++++ .../scripts/hardware_ci/run-tpu-v1-test.sh | 12 -- 2 files changed, 166 insertions(+), 12 deletions(-) create mode 100755 .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh new file mode 100755 index 0000000000000..d998c1f73b514 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." 
+ exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. 
+echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." 
+ return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. + overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 1 "test_struct_output_generate.py" \ + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 2 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 3 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 4 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 5 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 6 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. 
+ +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." + exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 5514d7770cff8..e565d4b246945 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" -run_and_track_test 11 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" -run_and_track_test 12 "test_moe_pallas.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" -run_and_track_test 13 "test_lora.py" \ - "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" -run_and_track_test 14 "test_tpu_qkv_linear.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" -run_and_track_test 15 "test_spmd_model_weight_loading.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" -run_and_track_test 16 "test_kv_cache_update_kernel.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" # After all tests have 
been attempted, exit with the overall status. if [ "$overall_script_exit_code" -ne 0 ]; then From 875af38e01217f20827f0b4e1353b91c884b9d53 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Sat, 26 Jul 2025 19:14:04 +0800 Subject: [PATCH 27/57] Support Intern-S1 (#21628) Signed-off-by: Roger Wang Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py Co-authored-by: Your Name Co-authored-by: Roger Wang Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 32 + .../vision_language_multi_image.py | 28 + tests/models/registry.py | 2 + vllm/model_executor/models/interns1.py | 711 ++++++++++++++++++ vllm/model_executor/models/interns1_vit.py | 421 +++++++++++ vllm/model_executor/models/registry.py | 1 + 7 files changed, 1196 insertions(+) create mode 100644 vllm/model_executor/models/interns1.py create mode 100644 vllm/model_executor/models/interns1_vit.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0f3b730eabedc..3847fc15119fd 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -593,6 +593,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index eb6b410848558..61f5525c6d7e7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -468,6 +468,37 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: ) +# Intern-S1 +def run_interns1(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "internlm/Intern-S1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + placeholder = "" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: model_name = "OpenGVLab/InternVL3-2B" @@ -1303,6 +1334,7 @@ model_example_map = { "h2ovl_chat": run_h2ovl, "hyperclovax_seed_vision": run_hyperclovax_seed_vision, "idefics3": run_idefics3, + "interns1": run_interns1, "internvl_chat": run_internvl, "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 2e14fc807e104..e312a0953e9be 100644 --- 
a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "internlm/Intern-S1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -946,6 +973,7 @@ model_example_map = { "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, + "interns1": load_interns1, "internvl_chat": load_internvl, "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, diff --git a/tests/models/registry.py b/tests/models/registry.py index b41e432d738a7..0dc5aec8db12e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -381,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": 
_HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py new file mode 100644 index 0000000000000..36204e4c5953f --- /dev/null +++ b/vllm/model_executor/models/interns1.py @@ -0,0 +1,711 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# -------------------------------------------------------- +# InternS1 +# Copyright (c) 2025 Shanghai AI Lab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, TypedDict, Union + +import torch +import torch.nn as nn +from transformers import InternVLProcessor, PretrainedConfig +from transformers.activations import ACT2FN +from transformers.models.got_ocr2.image_processing_got_ocr2_fast import ( + GotOcr2ImageProcessorFast) + +from vllm.config import VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.interns1_vit import InternS1VisionModel +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, WeightsMapper, 
flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + + +class InternS1MultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * + int(1 / config.downsample_ratio)**2) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * + int(1 / config.downsample_ratio)**2, + config.text_config.hidden_size) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear(config.text_config.hidden_size, + config.text_config.hidden_size) + + def forward(self, image_features): + hidden_states = self.layer_norm(image_features) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class InternS1ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """ + Shape: + `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + """ + + +class InternS1ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_images, total_image_feature_size, hidden_size)` + or a list of tensors of shape `(total_image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +InternS1ImageInputs = Union[InternS1ImagePixelInputs, + InternS1ImageEmbeddingInputs] + + +class InternS1VideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values: torch.Tensor + """ + Shape: + `(batch_size * num_video * num_frames, num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + +class InternS1VideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` + or a list of tensors of shape `(total_video_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + +InternS1VideoInputs = Union[InternS1VideoPixelInputs, + InternS1VideoEmbeddingInputs] + + +def resolve_interns1_min_max_num( + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch + + +def get_interns1_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +class InternS1ProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternS1-style models.""" + + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + return self.ctx.get_hf_processor(InternVLProcessor, **kwargs) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: 
int, + processor: Optional['GotOcr2ImageProcessorFast'] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor().image_processor + + if not isinstance(processor, GotOcr2ImageProcessorFast): + raise ValueError(f'GotOcr2ImageProcessorFast is expected but got ' + f'{type(processor)}') + num_image_patches = processor.get_number_of_image_tokens( + image_height, image_width, images_kwargs=dict()) + num_image_tokens = self.get_hf_processor( + ).image_seq_length * num_image_patches + return num_image_tokens + + def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None): + image_processor = self.get_hf_processor().image_processor + min_dynamic_patch = image_processor.min_patches + max_dynamic_patch = image_processor.max_patches + # HF format's InternVL processor uses `crop_to_patches` which is + # equivalent to `use_thumbnail` in original format. + use_thumbnail = image_processor.crop_to_patches + dynamic_image_size = True + min_num, max_num = resolve_interns1_min_max_num( + min_dynamic_patch, + max_dynamic_patch, + dynamic_image_size, + use_thumbnail=use_thumbnail) + + return get_interns1_target_ratios(min_num, max_num) + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + hf_config = self.ctx.get_hf_config() + base_height, base_width = hf_config.vision_config.image_size + target_ratios = self.resolve_target_ratios() + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_width * wr, base_height * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor.image_processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + assert not (largest_feature_size == 0 or largest_feature_pinpoint + is None), ("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + 
+ def get_max_image_tokens(self) -> int: + processor = self.get_hf_processor() + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=processor.image_processor, + ) + + +class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] + ): + """Basic image-only DummyInputsBuilder for InternS1-style models.""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = self.info.get_hf_processor().image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + +class InternS1MultiModalProcessor( + BaseMultiModalProcessor[InternS1ProcessingInfo]): + """ Basic image-only MultiModalProcessor for InternS1-style models.""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + image_token_id = hf_processor.image_token_id + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = torch.tensor(image_token_id) + images = mm_data.get('images', None) + image_processor = self.info.get_hf_processor().image_processor + if images is not None: + 
image_inputs = image_processor(images=images) + image_num_patches = image_inputs.pop("num_patches") + if not isinstance(image_num_patches, list): + raise ValueError( + f'num_patches is supposed to be list, but got ' + f'{type(image_num_patches)}') + image_num_patches = torch.tensor(image_num_patches) + processed_outputs['image_num_patches'] = image_num_patches + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + img_context_token = hf_processor.image_token + start_image_token = hf_processor.start_image_token + end_image_token = hf_processor.end_image_token + + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor.image_processor, + ) + + repl_features = img_context_token * feature_size + repl_full = start_image_token + repl_features + end_image_token + return PromptUpdateDetails.select_text(repl_full, + 
img_context_token) + + return [ + PromptReplacement( + modality="image", + target=img_context_token, + replacement=get_replacement, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + InternS1MultiModalProcessor, + info=InternS1ProcessingInfo, + dummy_inputs=InternS1DummyInputsBuilder) +class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP, SupportsLoRA): + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + # transformers InternVLProcessor uses as the seperator + # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116 + if modality.startswith("image"): + return '' + if modality.startswith("video"): + return "