Move PoolerConfig from config/__init__.py to config/pooler.py (#25181)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor, 2025-09-19 12:02:55 +01:00 (committed by GitHub)
parent 1dfea5f4a9
commit 058525b997
14 changed files with 193 additions and 179 deletions
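In practice the commit is a module move plus a user-facing rename (`--override-pooler-config` → `--pooler-config`). A minimal sketch of the import side, based on the hunks below:

```python
# PoolerConfig now lives in its own module:
from vllm.config.pooler import PoolerConfig

# The old import path keeps working, since vllm/config/__init__.py now
# re-imports it (see the import added in the diff below):
from vllm.config import PoolerConfig  # noqa: F811
```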

View File

@@ -59,7 +59,7 @@ enabling the corresponding APIs:

 #### Predefined models

 If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
-you can override some of its attributes via the `--override-pooler-config` option.
+you can override some of its attributes via the `--pooler-config` option.

 #### Converted models
@@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:

 When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
 its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.

-You can further customize this via the `--override-pooler-config` option,
+You can further customize this via the `--pooler-config` option,
 which takes priority over both the model's and Sentence Transformers's defaults.

 ## Offline Inference
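The same override is available offline via the Python API; a sketch assuming an embedding model whose `Pooler` accepts `pooler_config` (model name illustrative):

```python
from vllm import LLM
from vllm.config.pooler import PoolerConfig

# Override one attribute of the model's predefined pooler:
# here, disable L2-normalization of embedding outputs.
llm = LLM(
    model="sentence-transformers/all-MiniLM-L12-v2",  # illustrative model
    pooler_config=PoolerConfig(normalize=False),
)
outputs = llm.embed(["vLLM supports pooling models."])
```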

View File

@@ -457,7 +457,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A

 !!! note
     `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-    You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
+    You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.

 !!! note
     For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
@@ -552,7 +552,7 @@ If your model is not in the above list, we will try to automatically convert the

 !!! important
     For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
-    e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+    e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.

 #### Token Classification
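The same STEP configuration can be expressed directly as a `PoolerConfig`; a sketch where the token IDs are the doc's placeholders (123, 456, 789), not real vocabulary entries:

```python
from vllm.config.pooler import PoolerConfig

# STEP pooling for process-supervised reward models: only scores at
# `step_tag_id` positions are returned, restricted to `returned_token_ids`.
step_config = PoolerConfig(
    pooling_type="STEP",
    step_tag_id=123,                # placeholder token ID
    returned_token_ids=[456, 789],  # placeholder "good"/"bad" token IDs
)
```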

View File

@@ -42,7 +42,7 @@ python client.py

 ### Server Configuration

-The key parameters for chunked processing are in the `--override-pooler-config`:
+The key parameters for chunked processing are in the `--pooler-config`:

 ```json
 {

View File

@@ -13,7 +13,7 @@ Prerequisites:

 # MEAN pooling (processes all chunks, recommended for complete coverage)
 vllm serve intfloat/multilingual-e5-large \
-  --override-pooler-config \
+  --pooler-config \
   '{"pooling_type": "MEAN", "normalize": true, ' \
   '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
   --served-model-name multilingual-e5-large \
@@ -23,7 +23,7 @@ Prerequisites:

 # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
 vllm serve BAAI/bge-large-en-v1.5 \
-  --override-pooler-config \
+  --pooler-config \
   '{"pooling_type": "CLS", "normalize": true, ' \
   '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
   --served-model-name bge-large-en-v1.5 \
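Each CLI JSON above maps one-to-one onto fields of the relocated `PoolerConfig`; a sketch of the equivalent object for the MEAN-pooling command (values copied from the example):

```python
from vllm.config.pooler import PoolerConfig

# Equivalent of the '--pooler-config' JSON in the MEAN-pooling example:
# long inputs are split into chunks, embedded separately, then aggregated
# by weighted averaging (see `enable_chunked_processing`).
chunked_config = PoolerConfig(
    pooling_type="MEAN",
    normalize=True,
    enable_chunked_processing=True,
    max_embed_len=3_072_000,
)
```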

View File

@@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab

 vllm serve "$MODEL_NAME" \
     --tensor-parallel-size "$GPU_COUNT" \
     --enforce-eager \
-    --override-pooler-config "$POOLER_CONFIG" \
+    --pooler-config "$POOLER_CONFIG" \
     --served-model-name ${MODEL_CODE} \
     --api-key "$API_KEY" \
     --trust-remote-code \
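Because `PoolerConfig` is a plain dataclass, the `$POOLER_CONFIG` JSON can also be generated from Python rather than hand-escaped; a sketch under that assumption:

```python
import json
from dataclasses import asdict

from vllm.config.pooler import PoolerConfig

config = PoolerConfig(pooling_type="MEAN", normalize=True,
                      enable_chunked_processing=True)
# Drop unset (None) fields so the JSON carries only explicit overrides.
pooler_config_json = json.dumps(
    {k: v for k, v in asdict(config).items() if v is not None})
print(pooler_config_json)  # pass to: vllm serve ... --pooler-config '...'
```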

View File

@@ -216,7 +216,7 @@ def server_with_chunked_processing():
         "--enforce-eager",
         "--max-model-len",
         "512",  # Set smaller max_model_len to trigger chunking mechanism
-        '--override-pooler-config',
+        '--pooler-config',
         ('{"pooling_type": "MEAN", "normalize": true, '
          '"enable_chunked_processing": true, "max_embed_len": 10000}'),
         "--gpu-memory-utilization",

View File

@@ -58,7 +58,7 @@ def test_models(
     vllm_extra_kwargs = {}
     if model == "ssmits/Qwen2-7B-Instruct-embed-base":
-        vllm_extra_kwargs["override_pooler_config"] = \
+        vllm_extra_kwargs["pooler_config"] = \
             PoolerConfig(pooling_type="MEAN", normalize=False)

     max_model_len: Optional[int] = 512

View File

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from vllm.config.pooler import PoolerConfig
 from vllm.platforms import current_platform
@@ -99,7 +100,7 @@ def test_gemma_multimodal(
         convert="classify",
         load_format="auto",
         hf_overrides=update_config,
-        override_pooler_config={"pooling_type": "LAST"},
+        pooler_config=PoolerConfig(pooling_type="LAST"),
         max_model_len=512,
         enforce_eager=True,
         tensor_parallel_size=1,

View File

@@ -24,18 +24,18 @@ def test_classify_models_using_activation(
     dtype: str,
 ) -> None:

-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     override_pooler_config=PoolerConfig(
-                         activation=False)) as vllm_model:
+    with vllm_runner(
+            model,
+            max_model_len=512,
+            dtype=dtype,
+            pooler_config=PoolerConfig(activation=False)) as vllm_model:
         wo_activation_out = vllm_model.classify(example_prompts)

-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     override_pooler_config=PoolerConfig(
-                         activation=True)) as vllm_model:
+    with vllm_runner(
+            model,
+            max_model_len=512,
+            dtype=dtype,
+            pooler_config=PoolerConfig(activation=True)) as vllm_model:
         w_activation_out = vllm_model.classify(example_prompts)

     for wo_activation, w_activation in zip(wo_activation_out,
@@ -43,9 +43,8 @@ def test_classify_models_using_activation(
         wo_activation = torch.tensor(wo_activation)
         w_activation = torch.tensor(w_activation)

-        assert not torch.allclose(
-            wo_activation, w_activation,
-            atol=1e-2), "override_pooler_config is not working"
+        assert not torch.allclose(wo_activation, w_activation,
+                                  atol=1e-2), "pooler_config is not working"
         assert torch.allclose(softmax(wo_activation), w_activation,
                               1e-3 if dtype == "float" else 1e-2)
@@ -65,23 +64,22 @@ def test_embed_models_using_normalize(
     dtype: str,
 ) -> None:

-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     override_pooler_config=PoolerConfig(
-                         normalize=False)) as vllm_model:
-        wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
-
-    with vllm_runner(
-            model,
-            max_model_len=512,
-            dtype=dtype,
-            override_pooler_config=PoolerConfig(normalize=True)) as vllm_model:
+    with vllm_runner(
+            model,
+            max_model_len=512,
+            dtype=dtype,
+            pooler_config=PoolerConfig(normalize=False)) as vllm_model:
+        wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
+
+    with vllm_runner(model,
+                     max_model_len=512,
+                     dtype=dtype,
+                     pooler_config=PoolerConfig(normalize=True)) as vllm_model:
         w_normalize = torch.tensor(vllm_model.embed(example_prompts))

     assert not torch.allclose(
         wo_normalize, w_normalize,
-        atol=1e-2), "override_pooler_config normalize is not working"
+        atol=1e-2), "pooler_config normalize is not working"
     assert torch.allclose(
         F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
         atol=1e-2), "w_normal should be close to normal(wo_normal)."
@@ -102,18 +100,16 @@ def test_reward_models_using_softmax(
     dtype: str,
 ) -> None:

-    with vllm_runner(
-        model,
-        max_model_len=1024,
-        dtype=dtype,
-        override_pooler_config=PoolerConfig(softmax=False)) as vllm_model:
+    with vllm_runner(model,
+                     max_model_len=1024,
+                     dtype=dtype,
+                     pooler_config=PoolerConfig(softmax=False)) as vllm_model:
         wo_softmax = vllm_model.encode(example_prompts)

-    with vllm_runner(
-        model,
-        max_model_len=1024,
-        dtype=dtype,
-        override_pooler_config=PoolerConfig(softmax=True)) as vllm_model:
+    with vllm_runner(model,
+                     max_model_len=1024,
+                     dtype=dtype,
+                     pooler_config=PoolerConfig(softmax=True)) as vllm_model:
         w_softmax = vllm_model.encode(example_prompts)

     for wo, w in zip(wo_softmax, w_softmax):
@@ -121,7 +117,7 @@ def test_reward_models_using_softmax(
         w = torch.tensor(w)

         assert not torch.allclose(
-            wo, w, atol=1e-2), "override_pooler_config softmax is not working"
+            wo, w, atol=1e-2), "pooler_config softmax is not working"
         assert torch.allclose(
             softmax(wo), w,
             atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."

View File

@@ -207,25 +207,19 @@ def test_get_pooling_config():
     model_id = "sentence-transformers/all-MiniLM-L12-v2"
     model_config = ModelConfig(model_id)

-    pooling_config = model_config._init_pooler_config()
-    assert pooling_config is not None
-
-    assert pooling_config.normalize
-    assert pooling_config.pooling_type == PoolingType.MEAN.name
+    assert model_config.pooler_config is not None
+    assert model_config.pooler_config.normalize
+    assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name


 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
 def test_get_pooling_config_from_args():
     model_id = "sentence-transformers/all-MiniLM-L12-v2"
-    model_config = ModelConfig(model_id)
-
-    override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
-    model_config.override_pooler_config = override_pooler_config
-
-    pooling_config = model_config._init_pooler_config()
-    assert pooling_config is not None
-    assert asdict(pooling_config) == asdict(override_pooler_config)
+    pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
+    model_config = ModelConfig(model_id, pooler_config=pooler_config)
+
+    assert asdict(model_config.pooler_config) == asdict(pooler_config)


 @pytest.mark.parametrize(

View File

@@ -40,6 +40,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
                                     MultiModalConfig)
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
+from vllm.config.pooler import PoolerConfig
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.structured_outputs import StructuredOutputsConfig
@@ -406,13 +407,6 @@ class ModelConfig:
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    pooler_config: Optional["PoolerConfig"] = field(init=False)
-    """Pooler config which controls the behaviour of output pooling in pooling
-    models."""
-    override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None
-    """Initialize non-default pooling config or override default pooling config
-    for the pooling model. e.g. `{"pooling_type": "mean", "normalize": false}`.
-    """
     logits_processor_pattern: Optional[str] = None
     """Optional regex pattern specifying valid logits processor qualified names
     that can be passed with the `logits_processors` extra completion argument.
@@ -448,6 +442,14 @@ class ModelConfig:
     io_processor_plugin: Optional[str] = None
     """IOProcessor plugin name to load at model startup"""

+    # Pooler config
+    pooler_config: Optional[PoolerConfig] = None
+    """Pooler config which controls the behaviour of output pooling in pooling
+    models."""
+    override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
+    """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
+    v0.12.0 or v1.0.0, whichever is sooner."""
+
     # Multimodal config and init vars
     multimodal_config: Optional[MultiModalConfig] = None
     """Configuration for multimodal model. If `None`, this will be inferred
@@ -709,7 +711,33 @@ class ModelConfig:
         self._architecture = arch
         logger.info("Resolved architecture: %s", arch)

-        self.pooler_config = self._init_pooler_config()
+        # Init pooler config if needed
+        if self.runner_type == "pooling":
+            if self.override_pooler_config is not None:
+                logger.warning_once(
+                    "`override_pooler_config` is deprecated and will be "
+                    "removed in v0.12.0 or v1.0.0, whichever is sooner. "
+                    "Please use `pooler_config` instead.")
+
+                if isinstance(self.override_pooler_config, dict):
+                    self.pooler_config = PoolerConfig(
+                        **self.override_pooler_config)
+                else:
+                    self.pooler_config = self.override_pooler_config
+
+            if self.pooler_config is None:
+                self.pooler_config = PoolerConfig()
+
+            base_config = get_pooling_config(self.model, self.revision)
+            if base_config is not None:
+                # Only set values that are not overridden by the user
+                for k, v in base_config.items():
+                    if getattr(self.pooler_config, k) is None:
+                        setattr(self.pooler_config, k, v)
+
+            default_pooling_type = self._model_info.default_pooling_type
+            if self.pooler_config.pooling_type is None:
+                self.pooler_config.pooling_type = default_pooling_type

         self.dtype: torch.dtype = _get_and_verify_dtype(
             self.model,
@@ -869,29 +897,6 @@ class ModelConfig:
         return get_sentence_transformer_tokenizer_config(
             self.model, self.revision)

-    def _init_pooler_config(self) -> Optional["PoolerConfig"]:
-        if self.runner_type == "pooling":
-            if isinstance(self.override_pooler_config, dict):
-                self.override_pooler_config = PoolerConfig(
-                    **self.override_pooler_config)
-
-            pooler_config = self.override_pooler_config or PoolerConfig()
-
-            base_config = get_pooling_config(self.model, self.revision)
-            if base_config is not None:
-                # Only set values that are not overridden by the user
-                for k, v in base_config.items():
-                    if getattr(pooler_config, k) is None:
-                        setattr(pooler_config, k, v)
-
-            default_pooling_type = self._model_info.default_pooling_type
-            if pooler_config.pooling_type is None:
-                pooler_config.pooling_type = default_pooling_type
-
-            return pooler_config
-
-        return None
-
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
         if tokenizer_mode not in get_args(TokenizerMode):
@@ -1833,94 +1838,6 @@ class DeviceConfig:
         self.device = torch.device(self.device_type)

-@config
-@dataclass
-class PoolerConfig:
-    """Controls the behavior of output pooling in pooling models."""
-
-    pooling_type: Optional[str] = None
-    """
-    The pooling method of the pooling model. This should be a key in
-    [`vllm.model_executor.layers.pooler.PoolingType`][].
-    """
-
-    ## for embeddings models
-    normalize: Optional[bool] = None
-    """
-    Whether to normalize the embeddings outputs. Defaults to True.
-    """
-    dimensions: Optional[int] = None
-    """
-    Reduce the dimensions of embeddings if model
-    support matryoshka representation. Defaults to None.
-    """
-    enable_chunked_processing: Optional[bool] = None
-    """
-    Whether to enable chunked processing for long inputs that exceed the model's
-    maximum position embeddings. When enabled, long inputs will be split into
-    chunks, processed separately, and then aggregated using weighted averaging.
-    This allows embedding models to handle arbitrarily long text without CUDA
-    errors. Defaults to False.
-    """
-    max_embed_len: Optional[int] = None
-    """
-    Maximum input length allowed for embedding generation. When set, allows
-    inputs longer than max_embed_len to be accepted for embedding models.
-    When an input exceeds max_embed_len, it will be handled according to
-    the original max_model_len validation logic.
-    Defaults to None (i.e. set to max_model_len).
-    """
-
-    ## for classification models
-    activation: Optional[bool] = None
-    """
-    Whether to apply activation function to the classification outputs.
-    Defaults to True.
-    """
-    logit_bias: Optional[float] = None
-    """
-    If provided, apply classification logit biases. Defaults to None.
-    """
-
-    ## for reward models
-    softmax: Optional[bool] = None
-    """
-    Whether to apply softmax to the reward outputs.
-    Defaults to True.
-    """
-    step_tag_id: Optional[int] = None
-    """
-    If set, only the score corresponding to the ``step_tag_id`` in the
-    generated sentence should be returned. Otherwise, the scores for all tokens
-    are returned.
-    """
-    returned_token_ids: Optional[list[int]] = None
-    """
-    A list of indices for the vocabulary dimensions to be extracted,
-    such as the token IDs of ``good_token`` and ``bad_token`` in the
-    ``math-shepherd-mistral-7b-prm`` model.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
     "float16": torch.float16,

vllm/config/pooler.py (new file, 97 lines)

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from typing import Any, Optional
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+
+@config
+@dataclass
+class PoolerConfig:
+    """Controls the behavior of output pooling in pooling models."""
+
+    pooling_type: Optional[str] = None
+    """
+    The pooling method of the pooling model. This should be a key in
+    [`vllm.model_executor.layers.pooler.PoolingType`][].
+    """
+
+    ## for embeddings models
+    normalize: Optional[bool] = None
+    """
+    Whether to normalize the embeddings outputs. Defaults to True.
+    """
+    dimensions: Optional[int] = None
+    """
+    Reduce the dimensions of embeddings if model
+    support matryoshka representation. Defaults to None.
+    """
+    enable_chunked_processing: Optional[bool] = None
+    """
+    Whether to enable chunked processing for long inputs that exceed the model's
+    maximum position embeddings. When enabled, long inputs will be split into
+    chunks, processed separately, and then aggregated using weighted averaging.
+    This allows embedding models to handle arbitrarily long text without CUDA
+    errors. Defaults to False.
+    """
+    max_embed_len: Optional[int] = None
+    """
+    Maximum input length allowed for embedding generation. When set, allows
+    inputs longer than max_embed_len to be accepted for embedding models.
+    When an input exceeds max_embed_len, it will be handled according to
+    the original max_model_len validation logic.
+    Defaults to None (i.e. set to max_model_len).
+    """
+
+    ## for classification models
+    activation: Optional[bool] = None
+    """
+    Whether to apply activation function to the classification outputs.
+    Defaults to True.
+    """
+    logit_bias: Optional[float] = None
+    """
+    If provided, apply classification logit biases. Defaults to None.
+    """
+
+    ## for reward models
+    softmax: Optional[bool] = None
+    """
+    Whether to apply softmax to the reward outputs.
+    Defaults to True.
+    """
+    step_tag_id: Optional[int] = None
+    """
+    If set, only the score corresponding to the ``step_tag_id`` in the
+    generated sentence should be returned. Otherwise, the scores for all tokens
+    are returned.
+    """
+    returned_token_ids: Optional[list[int]] = None
+    """
+    A list of indices for the vocabulary dimensions to be extracted,
+    such as the token IDs of ``good_token`` and ``bad_token`` in the
+    ``math-shepherd-mistral-7b-prm`` model.
+    """
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
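The new module is self-contained apart from the `config` decorator; a quick usage sketch (field values illustrative):

```python
from vllm.config.pooler import PoolerConfig

config = PoolerConfig(pooling_type="MEAN", normalize=True, dimensions=256)
# Pooling fields do not affect the computation graph, so no factors are
# mixed into the hash (see the comment in compute_hash above).
print(config.compute_hash())
```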

View File

@@ -441,6 +441,7 @@ class EngineArgs:
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

+    pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
     override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
         ModelConfig.override_pooler_config
     compilation_config: CompilationConfig = \
@@ -579,8 +580,11 @@ class EngineArgs:
                                  help=model_kwargs["hf_token"]["help"])
         model_group.add_argument("--hf-overrides",
                                  **model_kwargs["hf_overrides"])
+        model_group.add_argument("--pooler-config",
+                                 **model_kwargs["pooler_config"])
         model_group.add_argument("--override-pooler-config",
-                                 **model_kwargs["override_pooler_config"])
+                                 **model_kwargs["override_pooler_config"],
+                                 deprecated=True)
         model_group.add_argument("--logits-processor-pattern",
                                  **model_kwargs["logits_processor_pattern"])
         model_group.add_argument("--generation-config",
@@ -1031,6 +1035,7 @@ class EngineArgs:
             mm_shm_cache_max_object_size_mb=self.
             mm_shm_cache_max_object_size_mb,
             mm_encoder_tp_mode=self.mm_encoder_tp_mode,
+            pooler_config=self.pooler_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
             generation_config=self.generation_config,
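For programmatic launches, the new `pooler_config` field on `EngineArgs` mirrors the CLI flag; a minimal sketch (model name illustrative):

```python
from vllm.config.pooler import PoolerConfig
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="BAAI/bge-large-en-v1.5",  # illustrative pooling model
    pooler_config=PoolerConfig(pooling_type="CLS", normalize=True),
)
```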

View File

@@ -151,9 +151,11 @@ class LLM:
             multi-modal processor obtained from `AutoProcessor.from_pretrained`.
             The available overrides depend on the model that is being run.
             For example, for Phi-3-Vision: `{"num_crops": 4}`.
-        override_pooler_config: Initialize non-default pooling config or
-            override default pooling config for the pooling model.
-            e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
+        pooler_config: Initialize non-default pooling config for the pooling
+            model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
+        override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
+            argument is deprecated and will be removed in v0.12.0 or v1.0.0,
+            whichever is sooner.
         compilation_config: Either an integer or a dictionary. If it is an
             integer, it is used as the level of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
@@ -191,6 +193,7 @@ class LLM:
         hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        pooler_config: Optional[PoolerConfig] = None,
         override_pooler_config: Optional[PoolerConfig] = None,
         structured_outputs_config: Optional[Union[dict[
             str, Any], StructuredOutputsConfig]] = None,
@@ -288,6 +291,7 @@ class LLM:
             hf_token=hf_token,
             hf_overrides=hf_overrides,
             mm_processor_kwargs=mm_processor_kwargs,
+            pooler_config=pooler_config,
             override_pooler_config=override_pooler_config,
             structured_outputs_config=structured_outputs_instance,
             compilation_config=compilation_config_instance,
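During the deprecation window both keyword arguments are accepted; a sketch of the two spellings (model name illustrative):

```python
from vllm import LLM
from vllm.config.pooler import PoolerConfig

# New spelling:
llm = LLM("intfloat/multilingual-e5-large",
          pooler_config=PoolerConfig(pooling_type="MEAN", normalize=True))

# Old spelling is still accepted for now, but triggers the warning_once
# added above and will be removed in v0.12.0 or v1.0.0, whichever is sooner:
# llm = LLM(..., override_pooler_config=PoolerConfig(pooling_type="MEAN"))
```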