From bd89ce16d216f33d93cb72d0a88b2a98d726784a Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 24 Dec 2025 17:54:57 +0800 Subject: [PATCH 1/8] [Model] Introduce verify_and_update_model_config for VerifyAndUpdateConfig. (#31131) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi --- vllm/config/model.py | 19 +++++++++- vllm/model_executor/models/config.py | 57 +++++++++++++++------------- vllm/model_executor/models/llama.py | 3 -- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index c3e23de220949..ce554b136cef3 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -595,7 +595,7 @@ class ModelConfig: # Avoid running try_verify_and_update_config multiple times self.config_updated = False - + self._try_verify_and_update_model_config() self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -1008,6 +1008,23 @@ class ModelConfig: "when expert parallelism is enabled." ) + def _try_verify_and_update_model_config(self): + # Avoid running try_verify_and_update_config multiple times + if getattr(self, "config_updated", False): + return + + architecture = self.architecture + if architecture is None: + return + + from vllm.model_executor.models.config import ( + MODELS_CONFIG_MAP, + ) + + cls = MODELS_CONFIG_MAP.get(architecture, None) + if cls is not None: + cls.verify_and_update_model_config(self) + def verify_dual_chunk_attention_config( self, load_config: LoadConfig, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index d33b3fdf47467..10fd599f9e5f8 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -13,7 +13,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import ModelConfig, VllmConfig logger = init_logger(__name__) @@ -21,20 +21,24 @@ logger = init_logger(__name__) class VerifyAndUpdateConfig: @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - raise NotImplementedError + return - -class Gemma3TextModelConfig: @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - hf_config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + return + + +class Gemma3TextModelConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + hf_config = model_config.hf_config hf_config.is_causal = not hf_config.use_bidirectional_attention class GteNewModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config assert config.__class__.__name__ == "NewConfig" assert config.hidden_act == "gelu" @@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig): class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.use_activation is None: pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - model_config = vllm_config.model_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: config = model_config.hf_config if config.position_embedding_type == "rotary": @@ -90,10 +93,10 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): class LlamaBidirectionalConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: + def verify_and_update_model_config(model_config: "ModelConfig") -> None: from vllm.config.pooler import PoolingTypeStr - hf_config = vllm_config.model_config.hf_config + hf_config = model_config.hf_config hf_config.is_causal = False pooling_type_map: dict[str, PoolingTypeStr] = { @@ -105,7 +108,7 @@ class LlamaBidirectionalConfig(VerifyAndUpdateConfig): pooling_type = pooling_type_map.get(hf_config.pooling, None) if pooling_type is None: raise ValueError(f"pool_type {hf_config.pooling} not supported") - vllm_config.model_config.pooler_config.pooling_type = pooling_type + model_config.pooler_config.pooling_type = pooling_type class NomicBertModelConfig(VerifyAndUpdateConfig): @@ -204,8 +207,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.step_tag_id is None: pooler_config.step_tag_id = 151651 @@ -213,8 +216,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.softmax is None: pooler_config.softmax = False @@ -222,8 +225,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config is_original_qwen3_reranker = getattr( config, "is_original_qwen3_reranker", False @@ -237,23 +240,23 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): "Try loading the original Qwen3 Reranker?, see: " "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" ) - vllm_config.model_config.hf_config.method = "from_2_way_softmax" + model_config.hf_config.method = "from_2_way_softmax" class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config config.num_labels = 1 - pooler_config = vllm_config.model_config.pooler_config + pooler_config = model_config.pooler_config if pooler_config.logit_bias is None: pooler_config.logit_bias = 2.65 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config assert config.__class__.__name__ == "GteConfig" assert config.hidden_act == "gelu" diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 84f4211df4c20..f0f2983f84637 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -64,7 +64,6 @@ from .interfaces import ( SupportsLoRA, SupportsPP, ) -from .interfaces_base import attn_type from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -707,14 +706,12 @@ class LlamaForCausalLM( return name, loaded_weight -@attn_type("encoder_only") class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)): # This class sets the correct attention type and pooling type # through LlamaBidirectionalConfig. pass -@attn_type("encoder_only") class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)): # This class sets the correct attention type and pooling type # through LlamaBidirectionalConfig. From 7adeb4bfa8630773c84f9ad9e97830becd540896 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 24 Dec 2025 19:15:27 +0800 Subject: [PATCH 2/8] [Bugfix] Fix `max_model_len="auto"` handling (#31260) Signed-off-by: DarkLight1337 --- vllm/config/model.py | 2 +- vllm/engine/arg_utils.py | 41 +++++++----- vllm/v1/core/kv_cache_utils.py | 118 +++++++++++++++------------------ 3 files changed, 79 insertions(+), 82 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index ce554b136cef3..a730aa8ad1b9c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -164,7 +164,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = Field(default=None, gt=0) + max_model_len: int = Field(default=None, ge=-1) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a524a1611f008..1442c83a1504a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]: elif contains_type(type_hints, set): kwargs[name].update(collection_to_kwargs(type_hints, set)) elif contains_type(type_hints, int): - kwargs[name]["type"] = int - # Special case for large integers - human_readable_ints = { - "max_model_len", - "max_num_batched_tokens", - "kv_cache_memory_bytes", - } - if name in human_readable_ints: + if name == "max_model_len": + kwargs[name]["type"] = human_readable_int_or_auto + kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}" + elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"): kwargs[name]["type"] = human_readable_int kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}" + else: + kwargs[name]["type"] = int elif contains_type(type_hints, float): kwargs[name]["type"] = float elif contains_type(type_hints, dict) and ( @@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str): raise NotImplementedError(msg) -def human_readable_int(value): +def human_readable_int(value: str) -> int: """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. - Also accepts -1 or 'auto' as a special value for auto-detection. Examples: - '1k' -> 1,000 - '1K' -> 1,024 - '25.6k' -> 25,600 - - '-1' or 'auto' -> -1 (special value for auto-detection) """ value = value.strip() - # Handle -1 or 'auto' as a special value for auto-detection - if value == "-1" or value.lower() == "auto": - return -1 - match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value) if match: decimal_multiplier = { @@ -2092,3 +2084,22 @@ def human_readable_int(value): # Regular plain number. return int(value) + + +def human_readable_int_or_auto(value: str) -> int: + """Parse human-readable integers like '1k', '2M', etc. + Including decimal values with decimal multipliers. + Also accepts -1 or 'auto' as a special value for auto-detection. + + Examples: + - '1k' -> 1,000 + - '1K' -> 1,024 + - '25.6k' -> 25,600 + - '-1' or 'auto' -> -1 (special value for auto-detection) + """ + value = value.strip() + + if value == "-1" or value.lower() == "auto": + return -1 + + return human_readable_int(value) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 85afff38e486a..1480a1f798ea0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -606,6 +606,43 @@ def get_request_block_hasher( return request_block_hasher +def _check_enough_kv_cache_memory( + available_memory: int, + get_needed_memory: Callable[[], int], + max_model_len: int, + estimate_max_model_len: Callable[[int], int], +): + if available_memory <= 0: + raise ValueError( + "No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when initializing the engine. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more details." + ) + + needed_memory = get_needed_memory() + + if needed_memory > available_memory: + estimated_max_len = estimate_max_model_len(available_memory) + estimated_msg = "" + if estimated_max_len > 0: + estimated_msg = ( + "Based on the available memory, " + f"the estimated maximum model length is {estimated_max_len}. " + ) + + raise ValueError( + f"To serve at least one request with the models's max seq len " + f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV " + f"cache is needed, which is larger than the available KV cache " + f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}" + f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " + f"when initializing the engine. " + f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + f"for more details." + ) + + def max_memory_usage_bytes( vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec] ) -> int: @@ -688,43 +725,12 @@ def check_enough_kv_cache_memory( """ # No need to check for available memory if the kv_cache_spec is empty - if not kv_cache_spec: - return - - if available_memory <= 0: - raise ValueError( - "No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine. " - "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " - "for more details." - ) - - max_model_len = vllm_config.model_config.max_model_len - needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values()) - - if needed_memory > available_memory: - # Estimate the maximum model length that can fit in the available memory - estimated_max_len = estimate_max_model_len( - vllm_config, kv_cache_spec, available_memory - ) - estimated_msg = "" - if estimated_max_len > 0: - estimated_msg = ( - "Based on the available memory, " - f"the estimated maximum model length is {estimated_max_len}." - ) - - raise ValueError( - f"To serve at least one request with the models's max seq len " - f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV " - f"cache is needed, which is larger than the available KV cache " - f"memory ({available_memory / GiB_bytes:.2f} GiB). " - f"{estimated_msg} " - f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " - f"when initializing the engine. " - f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " - f"for more details." + if kv_cache_spec: + _check_enough_kv_cache_memory( + available_memory, + lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()), + vllm_config.model_config.max_model_len, + lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am), ) @@ -1505,36 +1511,16 @@ def get_kv_cache_configs( # Check if the available memory is enough (using min across all workers). # We use the global groups to correctly account for padding. if global_kv_cache_groups: - min_available_memory = min(available_memory) - if min_available_memory <= 0: - raise ValueError( - "No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine." - ) - max_model_len = vllm_config.model_config.max_model_len - needed_memory = _max_memory_usage_bytes_from_groups( - vllm_config, global_kv_cache_groups + _check_enough_kv_cache_memory( + min(available_memory), + lambda: _max_memory_usage_bytes_from_groups( + vllm_config, global_kv_cache_groups + ), + vllm_config.model_config.max_model_len, + lambda am: _estimate_max_model_len_from_groups( + vllm_config, global_kv_cache_groups, am + ), ) - if needed_memory > min_available_memory: - estimated_max_len = _estimate_max_model_len_from_groups( - vllm_config, global_kv_cache_groups, min_available_memory - ) - estimated_msg = "" - if estimated_max_len > 0: - estimated_msg = ( - f"Based on the available memory, the estimated maximum " - f"model length is {estimated_max_len}. " - ) - raise ValueError( - f"To serve at least one request with the models's max seq len " - f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV " - f"cache is needed, which is larger than the available KV cache " - f"memory ({min_available_memory / GiB_bytes:.2f} GiB). " - f"{estimated_msg}" - f"Try increasing `gpu_memory_utilization` or decreasing " - f"`max_model_len` when initializing the engine." - ) kv_cache_configs: list[KVCacheConfig] = [] for kv_cache_spec_one_worker, available_memory_one_worker in zip( From aa3868ecfe65036349412ce54fa5b07cb545d836 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 24 Dec 2025 21:38:46 +0800 Subject: [PATCH 3/8] [Chore] Remove unused `noqa`s (#31263) Signed-off-by: DarkLight1337 --- tests/conftest.py | 2 +- tests/entrypoints/openai/test_async_tokenization.py | 2 +- tests/entrypoints/openai/test_chat.py | 2 +- tests/entrypoints/openai/test_chat_with_tool_reasoning.py | 2 +- .../openai/test_completion_with_function_calling.py | 4 ++-- tests/entrypoints/openai/test_default_mm_loras.py | 2 +- tests/entrypoints/openai/test_enable_force_include_usage.py | 2 +- tests/entrypoints/openai/test_messages.py | 2 +- tests/entrypoints/openai/test_return_tokens_as_ids.py | 2 +- tests/models/multimodal/generation/test_qwen2_vl.py | 2 +- tests/v1/kv_connector/unit/test_example_connector.py | 2 +- .../ec_transfer/ec_connector/example_connector.py | 5 +---- vllm/entrypoints/serve/elastic_ep/api_router.py | 2 +- vllm/model_executor/model_loader/bitsandbytes_loader.py | 3 +-- vllm/model_executor/model_loader/runai_streamer_loader.py | 1 - vllm/v1/worker/ec_connector_model_runner_mixin.py | 4 +--- vllm/v1/worker/kv_connector_model_runner_mixin.py | 4 +--- 17 files changed, 17 insertions(+), 26 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a03f40a9a72ac..30e25294925ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -410,7 +410,7 @@ class HfRunner: # don't put this import at the top level # it will call torch.cuda.device_count() - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained( model_name, diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 682420a83a442..1d3d110d30271 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index ab78a79774564..ae94c149017e7 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,7 +28,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files): # noqa: F811 +def server(zephyr_lora_files): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 7b3092b563030..445fa389d0007 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ "--max-model-len", "8192", diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 53369f074eca8..c6a5841ec3bfb 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -125,7 +125,7 @@ messages = [ @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -212,7 +212,7 @@ async def test_function_tool_use( @pytest.fixture(scope="module") -def k2_server(): # noqa: F811 +def k2_server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 818ee2644b547..dd8f9d67d6903 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original @pytest.fixture(scope="module") -def multimodal_server(): # noqa: F811 +def multimodal_server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py index 9d527c45c1fae..8e7e34ee2b71b 100644 --- a/tests/entrypoints/openai/test_enable_force_include_usage.py +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer @pytest.fixture(scope="module") -def chat_server_with_force_include_usage(request): # noqa: F811 +def chat_server_with_force_include_usage(request): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py index 8de6c4cb6c887..ce8c3ff4a71a5 100644 --- a/tests/entrypoints/openai/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ "--max-model-len", "2048", diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index d4d9a6c5b6120..05a36febad0cc 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files): @pytest.fixture(scope="module") -def server_fixture(request, default_server_args): # noqa: F811 +def server_fixture(request, default_server_args): use_server_flag = request.param if use_server_flag: args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e1b7dbf99f1fd..d46dd640229d0 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -267,7 +267,7 @@ def run_embedding_input_test( """Inference result should be the same between original image/video input and image/video embeddings input. """ - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor processor = AutoProcessor.from_pretrained(model) diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py index 8312231716935..d415608c95faa 100644 --- a/tests/v1/kv_connector/unit/test_example_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path): # don't put this import at the top level # it will call torch.cuda.device_count() - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor # Create processor to handle the chat prompt processor = AutoProcessor.from_pretrained(MODEL_NAME) diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py index 3518044ce2e00..48a7d41908fd4 100644 --- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -81,10 +81,7 @@ class ECExampleConnector(ECConnectorBase): assert encoder_cache is not None if metadata is None: logger.warning( - ( - "In connector.start_load_caches, ", - "but the connector metadata is None", - ) + "In connector.start_load_caches, but the connector metadata is None" ) return # Load the EC for each mm data diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py index 21d5d2e60778a..e5adb81051ffd 100644 --- a/vllm/entrypoints/serve/elastic_ep/api_router.py +++ b/vllm/entrypoints/serve/elastic_ep/api_router.py @@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request): try: body = await raw_request.json() except json.JSONDecodeError as e: - raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904 + raise HTTPException(status_code=400, detail="Invalid JSON format") from e new_data_parallel_size = body.get("new_data_parallel_size") drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 97c7a20bc4d5a..aa020645021ea 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: SIM117 import fnmatch import glob import itertools @@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool: class BitsAndBytesModelLoader(BaseModelLoader): - """Model loader to load model weights with BitAndBytes quantization.""" + """Model loader to load model weights with BitsAndBytes quantization.""" possible_config_file_names = ["adapter_config.json"] diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 93da07c550195..fb33d3c6448bd 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: SIM117 import os from collections.abc import Generator diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py index 08a41532ea8e1..1a347a0b98ab2 100644 --- a/vllm/v1/worker/ec_connector_model_runner_mixin.py +++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py @@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners. from collections.abc import Generator from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import ( - TYPE_CHECKING, # noqa: UP035 -) +from typing import TYPE_CHECKING import torch diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 2bcc87b63bcdf..7bb4ebe476ecf 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -7,9 +7,7 @@ Define KV connector functionality mixin for model runners. import copy from collections.abc import Generator from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import ( - TYPE_CHECKING, # noqa: UP035 -) +from typing import TYPE_CHECKING import torch From d201807339697c6c8206ae08d2cdccfc25cb1ce1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 24 Dec 2025 21:39:13 +0800 Subject: [PATCH 4/8] [Chore] Bump `lm-eval` version (#31264) Signed-off-by: DarkLight1337 --- .../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh | 2 +- .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh | 2 +- .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh | 2 +- .../lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quark.md | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/rocm-test.txt | 2 +- requirements/test.in | 3 +-- requirements/test.txt | 2 +- 14 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index c8db951381b0b..0745da8dc418d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.9 +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index 897f84d1e360d..5c17a06245bcf 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 792f355c47a51..1b617ff17c41c 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index d85a1721db9a5..12336d7f85bc9 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index cbb2527a4ff0a..6959f81eab373 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index f022fa3672eeb..eafc82b98439b 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index d4a6176b236f1..f17ef89a5cbf9 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio Install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` Load and run the model in `vllm`: diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 9752039097d63..049a7ceed079b 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -18,7 +18,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 701ca6378cb16..8af3e24c7357c 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -23,7 +23,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index c54d7d2251999..bbab97740ff19 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -20,7 +20,7 @@ for more installation details. Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7b2c665448a3b..a5f6ac00d1c89 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 3f0fd235fba50..e4a3dd379d272 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -58,7 +58,7 @@ schemathesis==3.39.15 # OpenAI schema test # Evaluation and benchmarking -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval[api]>=0.4.9.2 jiwer==4.0.0 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test diff --git a/requirements/test.in b/requirements/test.in index 55452ce83f232..b3fd733fb1bc0 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,8 +34,7 @@ num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -# TODO: Use lm-eval[api]==0.4.10 once released -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/test.txt b/requirements/test.txt index ea2093e4347fe..4012c2d3b212b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -441,7 +441,7 @@ lightning-utilities==0.14.3 # torchmetrics llvmlite==0.44.0 # via numba -lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval==0.4.9.2 # via -r requirements/test.in lxml==5.3.0 # via From 7cd288a4b3f45f3524134b9915c35accd668c2f0 Mon Sep 17 00:00:00 2001 From: skaraban3807 Date: Wed, 24 Dec 2025 19:17:49 +0530 Subject: [PATCH 5/8] [PERF] Add interleaved memory allocation to NUMA module (#30800) --- csrc/cpu/utils.cpp | 51 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 88bc3c509790c..f2085b73b6a48 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { #ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask != nullptr, + "Failed to parse CPU string: " + cpu_ids); TORCH_CHECK(omp_cpu_mask->size > 0); std::vector omp_cpu_ids; omp_cpu_ids.reserve(omp_cpu_mask->size); @@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { // Memory node binding if (numa_available() != -1) { - int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); std::set node_ids; for (const auto& cpu_id : omp_cpu_ids) { int node_id = numa_node_of_cpu(cpu_id); if (node_id != -1) { node_ids.insert(node_id); } - if (node_id != mem_node_id) { - TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ", - omp_cpu_ids.front(), " is on NUMA node ", mem_node_id, - ". All CPUs should be on the same NUMA node for optimal " - "performance. Memory will be bound to NUMA node ", - mem_node_id, "."); - } } // Concatenate all node_ids into a single comma-separated string if (!node_ids.empty()) { @@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { } bitmask* mask = numa_parse_nodestring(node_ids_str.c_str()); - bitmask* src_mask = numa_get_membind(); + bitmask* src_mask = numa_get_mems_allowed(); int pid = getpid(); @@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { std::to_string(errno)); } - // restrict memory allocation node. - numa_set_membind(mask); + // Restrict memory allocation to the selected NUMA node(s). + // Enhances memory locality for the threads bound to those NUMA CPUs. + if (node_ids.size() > 1) { + errno = 0; + numa_set_interleave_mask(mask); + if (errno != 0) { + TORCH_WARN("numa_set_interleave_mask failed. errno: " + + std::to_string(errno)); + } else { + TORCH_WARN( + "NUMA binding: Using INTERLEAVE policy for memory " + "allocation across multiple NUMA nodes (nodes: " + + node_ids_str + + "). Memory allocations will be " + "interleaved across the specified NUMA nodes."); + } + } else { + errno = 0; + numa_set_membind(mask); + if (errno != 0) { + TORCH_WARN("numa_set_membind failed. errno: " + + std::to_string(errno)); + } else { + TORCH_WARN( + "NUMA binding: Using MEMBIND policy for memory " + "allocation on the NUMA nodes (" + + node_ids_str + + "). Memory allocations will be " + "strictly bound to these NUMA nodes."); + } + } + numa_set_strict(1); numa_free_nodemask(mask); numa_free_nodemask(src_mask); } else { - TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " + - std::to_string(errno)); + TORCH_WARN( + "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " + + std::to_string(errno)); } } } From 1ff67df1826a2e6d2d9be684151604c46f04947d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 24 Dec 2025 23:36:20 +0800 Subject: [PATCH 6/8] [CI] Reorganization pooling_mteb_test (#31265) Signed-off-by: wang.yuqi --- .../pooling/embed/test_correctness_mteb.py | 2 +- .../pooling/score/test_correctness_mteb.py | 2 +- tests/entrypoints/pooling/score/test_utils.py | 9 +- .../pooling_mteb_test/mteb_embed_utils.py | 228 ++++++++++++++++++ .../{mteb_utils.py => mteb_score_utils.py} | 222 ++--------------- .../language/pooling_mteb_test/test_baai.py | 68 +++--- .../test_bge_reranker_v2_gemma.py | 14 +- .../pooling_mteb_test/test_cross_encoder.py | 16 +- .../language/pooling_mteb_test/test_gte.py | 70 ++++-- .../pooling_mteb_test/test_intfloat.py | 30 ++- .../language/pooling_mteb_test/test_jina.py | 17 +- .../pooling_mteb_test/test_mxbai_rerank.py | 12 +- .../pooling_mteb_test/test_nemotron.py | 22 +- .../language/pooling_mteb_test/test_nomic.py | 20 +- .../pooling_mteb_test/test_qwen3_reranker.py | 12 +- .../test_snowflake_arctic_embed.py | 40 ++- .../pooling_mteb_test/test_st_projector.py | 16 +- tests/models/utils.py | 27 +-- 18 files changed, 480 insertions(+), 347 deletions(-) create mode 100644 tests/models/language/pooling_mteb_test/mteb_embed_utils.py rename tests/models/language/pooling_mteb_test/{mteb_utils.py => mteb_score_utils.py} (51%) diff --git a/tests/entrypoints/pooling/embed/test_correctness_mteb.py b/tests/entrypoints/pooling/embed/test_correctness_mteb.py index 8cdd3d3c858d5..4c8d9f0d82a24 100644 --- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py @@ -4,7 +4,7 @@ import os import pytest -from tests.models.language.pooling_mteb_test.mteb_utils import ( +from tests.models.language.pooling_mteb_test.mteb_embed_utils import ( MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder, diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/score/test_correctness_mteb.py index 71e75b93504ac..1ee45b44596fa 100644 --- a/tests/entrypoints/pooling/score/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py @@ -4,7 +4,7 @@ import os import pytest -from tests.models.language.pooling_mteb_test.mteb_utils import ( +from tests.models.language.pooling_mteb_test.mteb_score_utils import ( MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py index 92b443c48825f..356fd0ad6678f 100644 --- a/tests/entrypoints/pooling/score/test_utils.py +++ b/tests/entrypoints/pooling/score/test_utils.py @@ -202,11 +202,10 @@ class TestGetScorePrompt: tokenization_kwargs, mock_model_no_score_template, ): - # FIXME: Models implementing SupportsScoreTemplate must use their custom - # template implementation by default to preserve existing functionality. - # Attempting to use tokenizer_config.json templates would most likely break - # these models, as often they just inherit the template from the original LLM. - # CLI --chat-template overrides are still supported. + # FIXME: For now, we only apply a template when one is explicitly provided. + # We cannot rely on the tokenizer's chat template because many models + # inherit junk templates from their base LLM, which breaks both the models + # and the tests that use them. with ( patch( "vllm.model_executor.model_loader.get_model_cls", diff --git a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py new file mode 100644 index 0000000000000..a0b469f930644 --- /dev/null +++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import mteb +import numpy as np +import torch +from mteb.models import ModelMeta +from mteb.types import Array +from torch.utils.data import DataLoader + +import tests.ci_envs as ci_envs +from tests.models.utils import ( + EmbedModelInfo, + check_embeddings_close, + get_vllm_extra_kwargs, +) + +# Most embedding models on the STS12 task (See #17175): +# - Model implementation and minor changes in tensor dtype +# results in differences less than 1e-4 +# - Different model results in differences more than 1e-3 +# 1e-4 is a good tolerance threshold +MTEB_EMBED_TASKS = ["STS12"] +MTEB_EMBED_TOL = 1e-4 + + +_empty_model_meta = ModelMeta( + loader=None, + name="vllm/model", + revision="1", + release_date=None, + languages=None, + framework=[], + similarity_fn_name=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + modalities=["text"], # 'image' can be added to evaluate multimodal models +) + + +class MtebEmbedMixin(mteb.EncoderProtocol): + mteb_model_meta = _empty_model_meta + + def similarity( + self, + embeddings1: np.ndarray, + embeddings2: np.ndarray, + ) -> np.ndarray: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T) + return sim + + def similarity_pairwise( + self, + embeddings1: Array, + embeddings2: Array, + ) -> Array: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.sum(embeddings1 * embeddings2, axis=1) / ( + norm1.flatten() * norm2.flatten() + ) + return sim + + +class VllmMtebEncoder(MtebEmbedMixin): + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + + def encode( + self, + inputs: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + # Hoping to discover potential scheduling + # issues by randomizing the order. + sentences = [text for batch in inputs for text in batch["text"]] + r = self.rng.permutation(len(sentences)) + sentences = [sentences[i] for i in r] + outputs = self.llm.embed(sentences, use_tqdm=False) + embeds = np.array(outputs) + embeds = embeds[np.argsort(r)] + return embeds + + +class OpenAIClientMtebEncoder(MtebEmbedMixin): + def __init__(self, model_name: str, client): + self.model_name = model_name + self.client = client + self.rng = np.random.default_rng(seed=42) + + def encode( + self, + inputs: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + # Hoping to discover potential scheduling + # issues by randomizing the order. + sentences = [text for batch in inputs for text in batch["text"]] + r = self.rng.permutation(len(sentences)) + sentences = [sentences[i] for i in r] + + embeddings = self.client.embeddings.create( + model=self.model_name, input=sentences + ) + outputs = [d.embedding for d in embeddings.data] + embeds = np.array(outputs) + embeds = embeds[np.argsort(r)] + return embeds + + +def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): + tasks = mteb.get_tasks(tasks=tasks) + results = mteb.evaluate( + encoder, + tasks, + cache=None, + show_progress_bar=False, + ) + + main_score = results[0].scores["test"][0]["main_score"] + return main_score + + +def mteb_test_embed_models( + hf_runner, + vllm_runner, + model_info: EmbedModelInfo, + vllm_extra_kwargs=None, + hf_model_callback=None, + atol=MTEB_EMBED_TOL, +): + vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) + + # Test embed_dims, isnan and whether to use normalize + example_prompts = ["The chef prepared a delicious meal." * 1000] + + with vllm_runner( + model_info.name, + runner="pooling", + max_model_len=model_info.max_model_len, + **vllm_extra_kwargs, + ) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + + # Confirm whether vllm is using the correct architecture + if model_info.architecture: + assert model_info.architecture in model_config.architectures + + # Confirm whether the important configs in model_config are correct. + if model_info.pooling_type is not None: + assert model_config.pooler_config.pooling_type == model_info.pooling_type + if model_info.attn_type is not None: + assert model_config.attn_type == model_info.attn_type + if model_info.is_prefix_caching_supported is not None: + assert ( + model_config.is_prefix_caching_supported + == model_info.is_prefix_caching_supported + ) + if model_info.is_chunked_prefill_supported is not None: + assert ( + model_config.is_chunked_prefill_supported + == model_info.is_chunked_prefill_supported + ) + + vllm_main_score = run_mteb_embed_task( + VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS + ) + vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype + head_dtype = model_config.head_dtype + + # Test embedding_size, isnan and whether to use normalize + vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1) + outputs_tensor = torch.tensor(vllm_outputs) + assert not torch.any(torch.isnan(outputs_tensor)) + embedding_size = model_config.embedding_size + assert torch.tensor(vllm_outputs).shape[-1] == embedding_size + + # Accelerate mteb test by setting + # SentenceTransformers mteb score to a constant + if model_info.mteb_score is None: + with hf_runner( + model_info.name, + is_sentence_transformer=True, + dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype, + ) as hf_model: + # e.g. setting default parameters for the encode method of hf_runner + if hf_model_callback is not None: + hf_model_callback(hf_model) + + st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) + st_dtype = next(hf_model.model.parameters()).dtype + + # Check embeddings close to hf outputs + hf_outputs = hf_model.encode(example_prompts) + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) + else: + st_main_score = model_info.mteb_score + st_dtype = "Constant" + + print("Model:", model_info.name) + print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score) + print("SentenceTransformers:", st_dtype, st_main_score) + print("Difference:", st_main_score - vllm_main_score) + + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. + assert st_main_score - vllm_main_score < atol diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py similarity index 51% rename from tests/models/language/pooling_mteb_test/mteb_utils.py rename to tests/models/language/pooling_mteb_test/mteb_score_utils.py index 11e05a635c1d1..6c13502317736 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py @@ -7,37 +7,24 @@ from pathlib import Path import mteb import numpy as np import requests -import torch from mteb.models import ModelMeta -from mteb.types import Array from torch.utils.data import DataLoader -import tests.ci_envs as ci_envs from tests.models.utils import ( - EmbedModelInfo, RerankModelInfo, - check_embeddings_close, get_vllm_extra_kwargs, ) -template_home = ( - Path(__file__).parent.parent.parent.parent.parent - / "examples/pooling/score/template" -) - -# Most embedding models on the STS12 task (See #17175): -# - Model implementation and minor changes in tensor dtype -# results in differences less than 1e-4 -# - Different model results in differences more than 1e-3 -# 1e-4 is a good tolerance threshold -MTEB_EMBED_TASKS = ["STS12"] -MTEB_EMBED_TOL = 1e-4 - # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_TOL = 2e-3 +template_home = ( + Path(__file__).parent.parent.parent.parent.parent + / "examples/pooling/score/template" +) + _empty_model_meta = ModelMeta( loader=None, name="vllm/model", @@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta( ) -class VllmMtebEncoder(mteb.EncoderProtocol): +class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol): mteb_model_meta = _empty_model_meta - def __init__(self, vllm_model): - self.llm = vllm_model - self.rng = np.random.default_rng(seed=42) - - def encode( - self, - inputs: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - # Hoping to discover potential scheduling - # issues by randomizing the order. - sentences = [text for batch in inputs for text in batch["text"]] - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - outputs = self.llm.embed(sentences, use_tqdm=False) - embeds = np.array(outputs) - embeds = embeds[np.argsort(r)] - return embeds - - def similarity( - self, - embeddings1: np.ndarray, - embeddings2: np.ndarray, - ) -> np.ndarray: - # Cosine similarity - norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) - norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) - sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T) - return sim - - def similarity_pairwise( - self, - embeddings1: Array, - embeddings2: Array, - ) -> Array: - # Cosine similarity - norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) - norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) - sim = np.sum(embeddings1 * embeddings2, axis=1) / ( - norm1.flatten() * norm2.flatten() - ) - return sim - - -class OpenAIClientMtebEncoder(VllmMtebEncoder): - def __init__(self, model_name: str, client): - self.model_name = model_name - self.client = client - self.rng = np.random.default_rng(seed=42) - - def encode( - self, - inputs: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - # Hoping to discover potential scheduling - # issues by randomizing the order. - sentences = [text for batch in inputs for text in batch["text"]] - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - - embeddings = self.client.embeddings.create( - model=self.model_name, input=sentences - ) - outputs = [d.embedding for d in embeddings.data] - embeds = np.array(outputs) - embeds = embeds[np.argsort(r)] - return embeds - - -class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): - mteb_model_meta = _empty_model_meta +class VllmMtebCrossEncoder(MtebCrossEncoderMixin): def __init__(self, vllm_model): self.llm = vllm_model self.rng = np.random.default_rng(seed=42) @@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): return scores -class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): +class ScoreClientMtebEncoder(MtebCrossEncoderMixin): mteb_model_meta = _empty_model_meta def __init__(self, model_name: str, url): @@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder): return response["results"][0]["relevance_score"] -def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): - tasks = mteb.get_tasks(tasks=tasks) - results = mteb.evaluate( - encoder, - tasks, - cache=None, - show_progress_bar=False, - ) - - main_score = results[0].scores["test"][0]["main_score"] - return main_score - - -def mteb_test_embed_models( - hf_runner, - vllm_runner, - model_info: EmbedModelInfo, - vllm_extra_kwargs=None, - hf_model_callback=None, - atol=MTEB_EMBED_TOL, -): - vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) - - # Test embed_dims, isnan and whether to use normalize - example_prompts = ["The chef prepared a delicious meal." * 1000] - - with vllm_runner( - model_info.name, - runner="pooling", - max_model_len=model_info.max_model_len, - **vllm_extra_kwargs, - ) as vllm_model: - model_config = vllm_model.llm.llm_engine.model_config - - # Confirm whether vllm is using the correct architecture - if model_info.architecture: - assert model_info.architecture in model_config.architectures - - # Confirm whether vllm uses the correct default_pooling_type, which - # relates to whether chunked prefill and prefix caching are enabled - assert ( - model_config._model_info.default_pooling_type - == model_info.default_pooling_type - ) - - vllm_main_score = run_mteb_embed_task( - VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS - ) - vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype - head_dtype = model_config.head_dtype - - # Test embedding_size, isnan and whether to use normalize - vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1) - outputs_tensor = torch.tensor(vllm_outputs) - assert not torch.any(torch.isnan(outputs_tensor)) - embedding_size = model_config.embedding_size - assert torch.tensor(vllm_outputs).shape[-1] == embedding_size - - # Accelerate mteb test by setting - # SentenceTransformers mteb score to a constant - if model_info.mteb_score is None: - with hf_runner( - model_info.name, - is_sentence_transformer=True, - dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype, - ) as hf_model: - # e.g. setting default parameters for the encode method of hf_runner - if hf_model_callback is not None: - hf_model_callback(hf_model) - - st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) - st_dtype = next(hf_model.model.parameters()).dtype - - # Check embeddings close to hf outputs - hf_outputs = hf_model.encode(example_prompts) - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) - else: - st_main_score = model_info.mteb_score - st_dtype = "Constant" - - print("Model:", model_info.name) - print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score) - print("SentenceTransformers:", st_dtype, st_main_score) - print("Difference:", st_main_score - vllm_main_score) - - # We are not concerned that the vllm mteb results are better - # than SentenceTransformers, so we only perform one-sided testing. - assert st_main_score - vllm_main_score < atol - - def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): with tempfile.TemporaryDirectory() as prediction_folder: bm25s = mteb.get_model("bm25s") @@ -391,18 +209,28 @@ def mteb_test_rerank_models( # Score API is only enabled for num_labels == 1 assert model_config.hf_config.num_labels == 1 - # Confirm whether vllm uses the correct default_pooling_type, which - # relates to whether chunked prefill and prefix caching are enabled - assert ( - model_config._model_info.default_pooling_type - == model_info.default_pooling_type - ) - + # Maybe load chat_template. chat_template: str | None = None if model_info.chat_template_name is not None: chat_template = (template_home / model_info.chat_template_name).read_text() vllm_model.chat_template = chat_template + # Confirm whether the important configs in model_config are correct. + if model_info.pooling_type is not None: + assert model_config.pooler_config.pooling_type == model_info.pooling_type + if model_info.attn_type is not None: + assert model_config.attn_type == model_info.attn_type + if model_info.is_prefix_caching_supported is not None: + assert ( + model_config.is_prefix_caching_supported + == model_info.is_prefix_caching_supported + ) + if model_info.is_chunked_prefill_supported is not None: + assert ( + model_config.is_chunked_prefill_supported + == model_info.is_chunked_prefill_supported + ) + vllm_main_score = run_mteb_rerank( vllm_mteb_encoder(vllm_model), tasks=MTEB_RERANK_TASKS, diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py index bad13e2457146..2e55622a5d48c 100644 --- a/tests/models/language/pooling_mteb_test/test_baai.py +++ b/tests/models/language/pooling_mteb_test/test_baai.py @@ -4,90 +4,94 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-en", architecture="BertModel", mteb_score=0.779336792, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-base-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-small-en", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-small-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-large-en", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-large-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo( "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False ), ########## XLMRobertaModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-m3", architecture="XLMRobertaModel", mteb_score=0.787343078, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## Qwen2Model - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-code-v1", architecture="Qwen2Model", mteb_score=0.75724465, dtype="float32", + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - CLSPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-base", architecture="XLMRobertaForSequenceClassification", mteb_score=0.32398, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-large", architecture="XLMRobertaForSequenceClassification", enable_test=False, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-v2-m3", architecture="XLMRobertaForSequenceClassification", enable_test=False, diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 6b2e469644926..00f2d33546efc 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -9,14 +9,12 @@ import torch from torch.utils.data import DataLoader from tests.conftest import HfRunner -from tests.models.language.pooling_mteb_test.mteb_utils import ( - VllmMtebCrossEncoder, - mteb_test_rerank_models, -) -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo + +from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-v2-gemma", architecture="GemmaForSequenceClassification", mteb_score=0.33757, @@ -25,6 +23,10 @@ RERANK_MODELS = [ "classifier_from_token": ["Yes"], "method": "no_post_processing", }, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py index 638ffc7a62b0e..8bca49bb5b023 100644 --- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py +++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py @@ -3,23 +3,29 @@ import pytest from tests.models.utils import ( - CLSPoolingRerankModelInfo, - LASTPoolingRerankModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( "cross-encoder/ms-marco-TinyBERT-L-2-v2", mteb_score=0.32898, architecture="BertForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "tomaarsen/Qwen3-Reranker-0.6B-seq-cls", mteb_score=0.25736, architecture="Qwen3ForSequenceClassification", + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py index a22821fd65b5a..3d1d5aa84091e 100644 --- a/tests/models/language/pooling_mteb_test/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -5,36 +5,32 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "thenlper/gte-large", mteb_score=0.76807651, architecture="BertModel", + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "thenlper/gte-base", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "thenlper/gte-small", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False), + EmbedModelInfo( "thenlper/gte-large-zh", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( - "thenlper/gte-base-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo( "thenlper/gte-small-zh", architecture="BertModel", enable_test=False ), ########### NewModel @@ -43,48 +39,64 @@ MODELS = [ # - whether to use token_type_embeddings # - whether to use context expansion # So only test one (the most widely used) model - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-multilingual-base", architecture="GteNewModel", mteb_score=0.775074696, hf_overrides={"architectures": ["GteNewModel"]}, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-base-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-large-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, enable_test=False, ), ########### Qwen2ForCausalLM - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-Qwen2-1.5B-instruct", mteb_score=0.758473459018872, architecture="Qwen2ForCausalLM", + pooling_type="LAST", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## ModernBertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-modernbert-base", mteb_score=0.748193353, architecture="ModernBertModel", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## Qwen3ForCausalLM - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Qwen/Qwen3-Embedding-0.6B", mteb_score=0.771163695, architecture="Qwen3ForCausalLM", dtype="float32", + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Qwen/Qwen3-Embedding-4B", architecture="Qwen3ForCausalLM", dtype="float32", @@ -93,18 +105,26 @@ MODELS = [ ] RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( # classifier_pooling: mean "Alibaba-NLP/gte-reranker-modernbert-base", mteb_score=0.33386, architecture="ModernBertForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( "Alibaba-NLP/gte-multilingual-reranker-base", mteb_score=0.33062, architecture="GteNewForSequenceClassification", hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py index 1d078db69236a..377ab600aa443 100644 --- a/tests/models/language/pooling_mteb_test/test_intfloat.py +++ b/tests/models/language/pooling_mteb_test/test_intfloat.py @@ -3,40 +3,44 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/e5-small", architecture="BertModel", mteb_score=0.742285423, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "intfloat/e5-base", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "intfloat/e5-large", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False), + EmbedModelInfo( "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False ), ########## XLMRobertaModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-base", architecture="XLMRobertaModel", mteb_score=0.779325955, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-large", architecture="XLMRobertaModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-large-instruct", architecture="XLMRobertaModel", enable_test=False, diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py index c2065bcd6eb4c..b98ac91b97573 100644 --- a/tests/models/language/pooling_mteb_test/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import ( matryoshka_fy, ) from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, RerankModelInfo, ) from vllm import PoolingParams -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models EMBEDDING_MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "jinaai/jina-embeddings-v3", mteb_score=0.824413164, architecture="XLMRobertaModel", is_matryoshka=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, dtype="float32", ) ] RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( "jinaai/jina-reranker-v2-base-multilingual", mteb_score=0.33643, architecture="XLMRobertaForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ) ] diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index a6f2a89b268f1..50dc6a0bd0ad1 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -6,9 +6,9 @@ import pytest import torch from tests.conftest import HfRunner -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models mxbai_rerank_hf_overrides = { "architectures": ["Qwen2ForSequenceClassification"], @@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = { } RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "mixedbread-ai/mxbai-rerank-base-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, mteb_score=0.273, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "mixedbread-ai/mxbai-rerank-large-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py index 167c3fcf50d1c..c91616c9ec01e 100644 --- a/tests/models/language/pooling_mteb_test/test_nemotron.py +++ b/tests/models/language/pooling_mteb_test/test_nemotron.py @@ -3,29 +3,39 @@ import pytest +from tests.models.language.pooling_mteb_test.mteb_embed_utils import ( + mteb_test_embed_models, +) +from tests.models.language.pooling_mteb_test.mteb_score_utils import ( + mteb_test_rerank_models, +) from tests.models.utils import ( EmbedModelInfo, - LASTPoolingEmbedModelInfo, - LASTPoolingRerankModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models - EMBEDDING_MODELS = [ - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "nvidia/llama-nemotron-embed-1b-v2", architecture="LlamaBidirectionalModel", mteb_score=0.689164662128673, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ) ] RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "nvidia/llama-nemotron-rerank-1b-v2", architecture="LlamaBidirectionalForSequenceClassification", chat_template_name="nemotron-rerank.jinja", mteb_score=0.33994, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py index c54a43052483a..06c568026a75a 100644 --- a/tests/models/language/pooling_mteb_test/test_nomic.py +++ b/tests/models/language/pooling_mteb_test/test_nomic.py @@ -4,30 +4,38 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", mteb_score=0.737568559, enable_test=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v1.5", architecture="NomicBertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", mteb_score=0.715488912, enable_test=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index 9a1be6c0be1d6..a8e79c8391072 100644 --- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -6,10 +6,10 @@ import pytest import torch from tests.conftest import HfRunner -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo from tests.utils import multi_gpu_test -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models qwen3_reranker_hf_overrides = { "architectures": ["Qwen3ForSequenceClassification"], @@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = { } RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "Qwen/Qwen3-Reranker-0.6B", architecture="Qwen3ForSequenceClassification", mteb_score=0.25736, hf_overrides=qwen3_reranker_hf_overrides, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "Qwen/Qwen3-Reranker-4B", architecture="Qwen3ForSequenceClassification", hf_overrides=qwen3_reranker_hf_overrides, diff --git a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py index 3c30628aeaa49..37597a7e9ebab 100644 --- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py @@ -4,62 +4,82 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-xs", is_matryoshka=False, architecture="BertModel", mteb_score=0.714927797, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-s", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-long", is_matryoshka=False, architecture="NomicBertModel", mteb_score=0.681146831, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-l", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-v1.5", is_matryoshka=True, architecture="BertModel", mteb_score=0.649088363, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-l-v2.0", is_matryoshka=True, architecture="XLMRobertaModel", mteb_score=0.712258299, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-v2.0", is_matryoshka=True, architecture="GteModel", mteb_score=0.706622444, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py index 74fe4b9bcc03f..c1fd61b8e2270 100644 --- a/tests/models/language/pooling_mteb_test/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -3,25 +3,31 @@ import pytest from tests.models.utils import ( - CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, ) -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models # ST models with projector (Dense) layers ST_PROJECTOR_MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "TencentBAC/Conan-embedding-v1", architecture="BertModel", mteb_score=0.688611955, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "google/embeddinggemma-300m", architecture="Gemma3TextModel", mteb_score=0.7473819294684156, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, dtype="float32", ), diff --git a/tests/models/utils.py b/tests/models/utils.py index bf26c21fb5f58..12544bc96bb5a 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config.model import ModelConfig, ModelDType, RunnerOption +from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext from vllm.tokenizers import cached_tokenizer_from_config @@ -375,7 +375,10 @@ class ModelInfo: max_model_len: int | None = None hf_dtype: str = "float32" hf_overrides: dict[str, Any] | None = None - default_pooling_type: str = "" + pooling_type: str | None = None + attn_type: AttnTypeStr | None = None + is_prefix_caching_supported: bool | None = None + is_chunked_prefill_supported: bool | None = None enable_test: bool = True @@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo): matryoshka_dimensions: list[int] | None = None -@dataclass -class CLSPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "CLS" - - -@dataclass -class LASTPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "LAST" - - @dataclass class RerankModelInfo(ModelInfo): mteb_score: float | None = None chat_template_name: str | None = None -@dataclass -class CLSPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "CLS" - - -@dataclass -class LASTPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "LAST" - - @dataclass class GenerateModelInfo(ModelInfo): hf_dtype: str = "auto" From 66c98874401b1e246e6c649942156720da671acd Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Wed, 24 Dec 2025 09:37:11 -0600 Subject: [PATCH 7/8] [Bugfix][Hardware][AMD] Fix FP8 dtype in silu_mul quantization (#31179) Signed-off-by: c0de128 --- .../layers/quantization/utils/fp8_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 15ea9f7d60fff..8e4dde324f397 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -625,8 +625,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor( M, N = input.size() N_2 = N // 2 + fp8_dtype = current_platform.fp8_dtype() if output is None: - output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device) + output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device) output_scales = torch.empty( ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device @@ -637,9 +638,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor( assert M % BLOCK_M == 0 assert N_2 % BLOCK_N == 0 - finfo = torch.finfo(torch.float8_e4m3fn) - fp8_min = finfo.min - fp8_max = finfo.max + # Using the default value (240.0) from pytorch will cause accuracy + # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm + # platforms that use the torch.float8_e4m3fnuz dtype. + finfo = torch.finfo(fp8_dtype) + fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min + fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max # Force even division so we can avoid edgecases within the kernel. assert M % BLOCK_M == 0 From 5d9308968649c81ee5903fc2a77377d738ed2f6d Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Wed, 24 Dec 2025 23:45:47 +0800 Subject: [PATCH 8/8] [cli] complete vllm cli help message (#31226) Signed-off-by: Andy Xie --- vllm/entrypoints/cli/benchmark/main.py | 1 + vllm/entrypoints/cli/serve.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 2ff98577c3634..48f34fce1d44c 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand): ) -> FlexibleArgumentParser: bench_parser = subparsers.add_parser( self.name, + help=self.help, description=self.help, usage=f"vllm {self.name} [options]", ) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 96608f360e17b..77c7253aef06e 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand): self, subparsers: argparse._SubParsersAction ) -> FlexibleArgumentParser: serve_parser = subparsers.add_parser( - self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]" + self.name, + help="Launch a local OpenAI-compatible API server to serve LLM " + "completions via HTTP.", + description=DESCRIPTION, + usage="vllm serve [model_tag] [options]", ) serve_parser = make_arg_parser(serve_parser)