diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 9885309035e6c..9e28b0c443df4 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -418,7 +418,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config = vllm_config.model_config.hf_config
+        config: UltravoxConfig = vllm_config.model_config.hf_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multi_modal_config = multimodal_config
@@ -438,7 +438,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         self.multi_modal_projector = UltravoxProjector(config)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
-            hf_config=config.text_config,
+            hf_config=config.wrapped_model_config,
             prefix=maybe_prefix(prefix, "language_model"),
         )
         if config.text_model_id is not None:
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 71266b9327369..e67479516560f 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -46,7 +46,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
             projector or at the end. Versions v0.4.1 and below use `False`, but
             v0.5 and above use `True`.
     """
-
+    wrapped_model_config: transformers.PretrainedConfig
     model_type = "ultravox"
     audio_token = "<|audio|>"
     is_composition = False
@@ -113,9 +113,8 @@ class UltravoxConfig(transformers.PretrainedConfig):
         return super().__setattr__(key, value)

     @property
-    def text_config(self) -> Optional[transformers.PretrainedConfig]:
+    def text_config(self) -> transformers.PretrainedConfig:
         # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
         # the full model, but the text config is the text config of the inner
         # model.
-        return (self.wrapped_model_config.get_text_config()
-                if self.wrapped_model_config else None)
+        return self.wrapped_model_config.get_text_config()