From f5710ef02a230c55827c5843218801718f0b19df Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 29 Oct 2025 00:23:35 +0800
Subject: [PATCH] [Misc] Make `LayerBlockType` a `Literal` instead of `Enum`
 (#27658)

Signed-off-by: DarkLight1337
---
 vllm/config/model.py               | 26 ++++++++++----------------
 vllm/utils/__init__.py             |  6 ------
 vllm/v1/worker/tpu_model_runner.py |  3 +--
 3 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index a488258c56da2..e22c218c769da 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -41,7 +41,6 @@ from vllm.transformers_utils.config import (
 )
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import LayerBlockType
 from vllm.utils.import_utils import LazyLoader
 from vllm.utils.torch_utils import common_broadcastable_dtype
 
@@ -91,6 +90,7 @@ LogprobsMode = Literal[
 ]
 HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
 ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
+LayerBlockType = Literal["attention", "linear_attention", "mamba"]
 
 _RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
     "generate": ["generate", "transcription"],
@@ -1433,11 +1433,11 @@ class ModelConfig:
     def get_num_layers_by_block_type(
         self,
         parallel_config: ParallelConfig,
-        block_type: LayerBlockType = LayerBlockType.attention,
+        block_type: LayerBlockType = "attention",
     ) -> int:
         # This function relies on 'layers_block_type' in hf_config,
         # for w/o this attribute, we will need to have workarounds like so
-        attn_block_type = block_type == LayerBlockType.attention
+        attn_block_type = block_type == "attention"
         is_transformer = (
             not self.is_hybrid and not self.has_noops and not self.is_attention_free
         )
@@ -1469,9 +1469,7 @@ class ModelConfig:
                         )
                     else:
                         return self.get_num_layers(parallel_config)
-                return sum(
-                    t == block_type.value for t in layers_block_type_value[start:end]
-                )
+                return sum(t == block_type for t in layers_block_type_value[start:end])
 
             # Hybrid model Minimax
             attn_type_list = getattr(self.hf_config, "attn_type_list", None)
@@ -1481,19 +1479,16 @@ class ModelConfig:
             # Hybrid model Qwen3Next
             layer_types_value = getattr(self.hf_config, "layer_types", None)
             if layer_types_value is not None:
-                if getattr(block_type, "value", block_type) == "attention":
+                if block_type == "attention":
                     return sum(
                         t == "full_attention" for t in layer_types_value[start:end]
                     )
-                elif getattr(block_type, "value", block_type) == "linear_attention":
+                elif block_type == "linear_attention":
                     return sum(
                         t == "linear_attention" for t in layer_types_value[start:end]
                     )
                 else:
-                    return sum(
-                        t == getattr(block_type, "value", block_type)
-                        for t in layer_types_value[start:end]
-                    )
+                    return sum(t == block_type for t in layer_types_value[start:end])
 
             if (
                 layers_block_type_value is None
@@ -1501,10 +1496,9 @@ class ModelConfig:
                 and attn_type_list is None
                 and layer_types_value is None
             ):
                 raise ValueError(
-                    "The model is an hybrid without a"
-                    "layers_block_type or an attn_type_list, or a layer_types "
-                    "in the hf_config, cannot determine the num of "
-                    f"{block_type.value} layers"
+                    "The model is an hybrid without a layers_block_type or an "
+                    "attn_type_list, or a layer_types in the hf_config, "
+                    f"cannot determine the num of {block_type} layers"
                 )
 
     def get_mamba_chunk_size(self) -> int | None:
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 549827a927d9a..b5a7fea2c3571 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import enum
 import inspect
 import uuid
 import warnings
@@ -67,11 +66,6 @@ STR_INVALID_VAL: str = "INVALID"
 
 T = TypeVar("T")
 
 
-class LayerBlockType(enum.Enum):
-    attention = "attention"
-    mamba = "mamba"
-
-
 def random_uuid() -> str:
     return str(uuid.uuid4().hex)
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index ce769e8575ffd..5d7b181989ce5 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -53,7 +53,6 @@ from vllm.multimodal.inputs import (
 )
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import LayerBlockType
 from vllm.utils.math_utils import cdiv, prev_power_of_2
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (
@@ -212,7 +211,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Model-related.
         self.num_attn_layers = model_config.get_num_layers_by_block_type(
-            parallel_config, LayerBlockType.attention
+            parallel_config, "attention"
        )
         self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
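
Note (editor's illustration, not part of the applied diff): moving from
`enum.Enum` to `typing.Literal` means callers pass and compare plain strings,
so the `.value` unwrapping and `getattr(block_type, "value", block_type)`
fallbacks removed above become unnecessary, while a static type checker still
rejects invalid values at call sites. A minimal self-contained sketch of the
pattern; the helper `count_layers` and its layer list are hypothetical, not
vLLM code:

    from typing import Literal

    LayerBlockType = Literal["attention", "linear_attention", "mamba"]

    def count_layers(
        layer_types: list[str],
        block_type: LayerBlockType = "attention",
    ) -> int:
        # Plain string equality replaces the Enum-era `block_type.value`.
        return sum(t == block_type for t in layer_types)

    print(count_layers(["attention", "mamba", "attention"], "mamba"))  # 1
    # count_layers([], "sliding")  # rejected by mypy/pyright: not a valid Literal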