diff --git a/tests/models/utils.py b/tests/models/utils.py
index d84b4b820533e..479c056d543ec 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -14,6 +14,9 @@ from vllm.config.model import ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
+from vllm.transformers_utils.model_arch_config_convertor import (
+    ModelArchConfigConvertorBase,
+)
 
 from .. import ci_envs
 from .registry import HF_EXAMPLE_MODELS
@@ -488,7 +491,7 @@ def dummy_hf_overrides(
 
     # Only set MoE related config when the model has MoE layers.
    # Otherwise all models detected as MoE by _get_transformers_backend_cls.
-    if ModelConfig.get_num_experts(DummyConfig) > 0:
+    if ModelArchConfigConvertorBase.get_num_experts(text_config) > 0:
        update_dict.update(
            {
                "num_experts": num_experts,
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index 999f46083c6e0..16fd5b6b6dcbd 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, final
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -88,7 +88,9 @@ class ModelArchConfigConvertorBase:
 
         return self.hf_text_config.num_attention_heads
 
-    def get_num_experts(self) -> int:
+    @final
+    @classmethod
+    def get_num_experts(cls, hf_text_config: PretrainedConfig) -> int:
         """Returns the number of experts in the model."""
         num_expert_names = [
             "num_experts",  # Jamba
@@ -96,7 +98,7 @@ class ModelArchConfigConvertorBase:
             "n_routed_experts",  # DeepSeek
             "num_local_experts",  # Mixtral
         ]
-        num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
+        num_experts = getattr_iter(hf_text_config, num_expert_names, 0)
         if isinstance(num_experts, list):
             # Ernie VL's remote code uses list[int]...
             # The values are always the same so we just take the first one.
@@ -104,8 +106,11 @@ class ModelArchConfigConvertorBase:
         # Coerce to 0 if explicitly set to None
         return num_experts or 0
 
+    @final
     @classmethod
-    def get_torch_dtype(cls, hf_config, model_id: str, revision: str | None):
+    def get_torch_dtype(
+        cls, hf_config: PretrainedConfig, model_id: str, revision: str | None
+    ):
         # NOTE: getattr(config, "dtype", torch.float32) is not correct
         # because config.dtype can be None.
         config_dtype = getattr(hf_config, "dtype", None)
@@ -139,6 +144,7 @@ class ModelArchConfigConvertorBase:
 
         return config_dtype
 
+    @final
     @classmethod
     def _normalize_quantization_config(cls, config: PretrainedConfig):
         quant_cfg = getattr(config, "quantization_config", None)
@@ -171,6 +177,7 @@ class ModelArchConfigConvertorBase:
 
         return quant_cfg
 
+    @final
     @classmethod
     def get_quantization_config(cls, hf_config: PretrainedConfig):
         quant_cfg = cls._normalize_quantization_config(hf_config)
@@ -258,7 +265,7 @@ class ModelArchConfigConvertorBase:
             head_size=self.get_head_size(),
             vocab_size=self.get_vocab_size(),
             total_num_kv_heads=self.get_total_num_kv_heads(),
-            num_experts=self.get_num_experts(),
+            num_experts=self.get_num_experts(self.hf_text_config),
             quantization_config=self.get_quantization_config(self.hf_config),
             torch_dtype=self.get_torch_dtype(self.hf_config, model_id, revision),
             is_multimodal_model=self.is_multimodal_model(),
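For reference, a minimal sketch of how the reworked `get_num_experts` classmethod would be called after this change. The toy `PretrainedConfig` below is illustrative only (not a real checkpoint); `num_local_experts` is one of the attribute aliases the method probes via `getattr_iter`, per the hunk above.

```python
from transformers import PretrainedConfig

from vllm.transformers_utils.model_arch_config_convertor import (
    ModelArchConfigConvertorBase,
)

# Toy Mixtral-style text config; "num_local_experts" is one of the
# recognized expert-count attribute names.
text_config = PretrainedConfig(num_local_experts=8)

# No convertor instance is needed any more: the @final @classmethod takes
# the text config explicitly, which is what the updated check in
# tests/models/utils.py relies on.
assert ModelArchConfigConvertorBase.get_num_experts(text_config) == 8

# A config without any expert-count attribute (or with it set to None)
# coerces to 0, so non-MoE models skip the MoE overrides.
assert ModelArchConfigConvertorBase.get_num_experts(PretrainedConfig()) == 0
```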