Use Transformers helper get_text_config() instead of checking for text_config (#17105)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

parent 0bd7f8fca5
commit 423e9f1cbe
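The change is mechanical across the codebase: everywhere vLLM previously probed for a nested text_config attribute, it now calls the Transformers helper PretrainedConfig.get_text_config(). A minimal sketch of the helper's contract, assuming a transformers release that ships it; the checkpoint names are illustrative:

from transformers import AutoConfig

# Multimodal checkpoint: the helper returns the nested text sub-config.
config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")
assert config.get_text_config() is config.text_config

# Text-only checkpoint: the helper is a no-op and returns the config itself.
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
assert config.get_text_config() is config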
@@ -553,9 +553,8 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
-        if not hasattr(config, "hidden_size"):
-            # Support for llama4
-            config = config.text_config
+        # Support for llama4
+        config = config.get_text_config()
         # Default: Mixtral.
         E = config.num_local_experts
         topk = config.num_experts_per_tok
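For the MoE benchmark this removes the Llama 4 special case. A sketch of the resulting lookup, with a hypothetical helper name and the assumption that Llama 4 keeps its expert counts on the nested text config while Mixtral keeps them on the top-level config:

from transformers import AutoConfig

def moe_expert_shape(model_name: str) -> tuple[int, int]:
    # get_text_config() resolves both layouts without a hasattr probe.
    config = AutoConfig.from_pretrained(model_name).get_text_config()
    # (number of experts per MoE layer, experts routed per token)
    return config.num_local_experts, config.num_experts_per_tok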
@@ -24,10 +24,7 @@ def test_can_initialize(model_arch):
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
 
-        if hasattr(hf_config, "text_config"):
-            text_config: PretrainedConfig = hf_config.text_config
-        else:
-            text_config = hf_config
+        text_config = hf_config.get_text_config()
 
         text_config.update({
             "num_layers": 1,
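The four-line hasattr branch collapses into one call because get_text_config() hands back a live reference rather than a copy, so updating the returned object mutates the config the test model is actually built from. A sketch under that assumption, with an illustrative checkpoint name:

from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")
text_config = hf_config.get_text_config()
text_config.update({"num_hidden_layers": 1})
# The nested config was mutated in place, not a detached copy.
assert hf_config.text_config.num_hidden_layers == 1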
@@ -2841,12 +2841,10 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
 
-    # Fallbacks for multi-modal models if the root config
+    # Fallback for multi-modal models if the root config
     # does not define torch_dtype
-    if config_dtype is None and hasattr(config, "text_config"):
-        config_dtype = getattr(config.text_config, "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
 
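The net effect is a shorter lookup chain: the text sub-config is consulted first (which covers both text-only and multimodal configs), and vision_config remains a fallback for checkpoints that only record torch_dtype there. A condensed sketch of that order, not the full _get_and_verify_dtype logic:

def resolve_config_dtype(config):
    # Text sub-config first; for text-only models this is the config itself.
    dtype = getattr(config.get_text_config(), "torch_dtype", None)
    # Some multimodal checkpoints only set torch_dtype on the vision config.
    if dtype is None and hasattr(config, "vision_config"):
        dtype = getattr(config.vision_config, "torch_dtype", None)
    return dtype  # may be None; callers apply their own default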
@@ -760,19 +760,22 @@ def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
     No op for pure text models.
     """
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    elif hasattr(config, "thinker_config"):
+    # This block should be unnecessary after https://github.com/huggingface/transformers/pull/37517
+    if hasattr(config, "thinker_config"):
         # TODO(suyang.fy): Refactor code.
         # For Qwen2.5-Omni, change hf_text_config to
         # thinker_config.text_config.
         return config.thinker_config.text_config
-    else:
-        return config
+
+    text_config = config.get_text_config()
+
+    if text_config is not config:
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(text_config, "num_attention_heads")
+
+    return text_config
 
 
 def try_get_generation_config(
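The rewritten helper leans on object identity: for a pure text model get_text_config() returns the very same object, so `text_config is not config` holds only for composite multimodal configs, and the num_attention_heads assertion fires early exactly where a nested text config exists. A sketch of that control flow, with a hypothetical function name:

def checked_text_config(config):
    text_config = config.get_text_config()
    if text_config is not config:
        # Composite (multimodal) config: fail early if the nested text
        # config lacks the attributes downstream code relies on.
        assert hasattr(text_config, "num_attention_heads")
    return text_config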
@@ -508,13 +508,8 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
             logger.warning("Regarding multimodal models, vLLM currently "
                            "only supports adding LoRA to language model.")
 
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+        # Use get_text_config() in case of multimodal models
+        text_config = self.model_config.hf_config.get_text_config()
 
         self.lora_manager = LRUCacheWorkerLoRAManager(
             self.scheduler_config.max_num_seqs,
@@ -524,7 +519,7 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
             self.device,
             self.model.embedding_modules,
             self.model.embedding_padding_modules,
-            max_position_embeddings=max_pos_embeddings,
+            max_position_embeddings=text_config.max_position_embeddings,
         )
         self.model = self.lora_manager.create_lora_manager(self.model)
 
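The same simplification repeats in the HPU and GPU runners below: instead of probing self.model.config and falling back to its text_config, each runner asks the HF config for its text sub-config once and reads max_position_embeddings from it. A standalone sketch of the lookup the three runners now share, with a hypothetical function name:

def lora_max_position_embeddings(hf_config) -> int:
    # Valid for plain LLM configs (the helper returns the config itself)
    # and for multimodal wrappers (it returns the nested text config).
    return hf_config.get_text_config().max_position_embeddings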
@@ -724,14 +724,9 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
                 "Bias support in LoRA is not enabled in HPU yet."
             assert not self.lora_config.fully_sharded_loras, \
                 "Fully sharded LoRAs is not enabled in HPU yet."
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -741,7 +736,8 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
@@ -1130,14 +1130,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             logger.warning(
                 "Regarding multimodal models, vLLM currently "
                 "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -1147,7 +1142,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
             time_after_load = time.perf_counter()