From 7241acbd64cf25644a956c97da9cbfefb3c413ab Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 30 Jan 2025 14:09:46 +0000 Subject: [PATCH] review comments Signed-off-by: Lucas Wilkinson --- vllm/attention/layer.py | 4 ++-- vllm/config.py | 10 +++------- vllm/engine/arg_utils.py | 5 ----- vllm/model_executor/models/deepseek_v2.py | 2 +- vllm/worker/cache_engine.py | 13 ++++++------- vllm/worker/model_runner.py | 2 +- 6 files changed, 13 insertions(+), 23 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 866787a297cda..9b804a29a485d 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -44,7 +44,7 @@ class Attention(nn.Module): use_mla: bool = False, prefix: str = "", attn_type: str = AttentionType.DECODER, - **kwargs, + **extra_impl_args, ) -> None: super().__init__() if per_layer_sliding_window is not None: @@ -114,7 +114,7 @@ class Attention(nn.Module): self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **kwargs) + **extra_impl_args) self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads diff --git a/vllm/config.py b/vllm/config.py index 82d6144700df4..337224551a300 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -165,7 +165,6 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. - disable_mla: Whether to disable MLA for DeepSeek models. override_generation_config: Override the generation config with the given config. """ @@ -227,7 +226,6 @@ class ModelConfig: override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, generation_config: Optional[str] = None, - disable_mla: bool = False, enable_sleep_mode: bool = False, override_generation_config: Optional[Dict[str, Any]] = None, ) -> None: @@ -278,7 +276,6 @@ class ModelConfig: self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.disable_mla = disable_mla self.enable_sleep_mode = enable_sleep_mode from vllm.platforms import current_platform @@ -748,7 +745,7 @@ class ModelConfig: def get_head_size(self) -> int: # TODO remove hard code if self.is_deepseek_mla: - if self.should_use_mla: + if self.use_mla: return self.hf_text_config.kv_lora_rank else: qk_rope_head_dim = getattr(self.hf_text_config, @@ -815,7 +812,7 @@ class ModelConfig: def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: """Returns the number of KV heads per GPU.""" - if self.should_use_mla: + if self.use_mla: # When using MLA during decode it becomes MQA return 1 @@ -971,8 +968,7 @@ class ModelConfig: @property def use_mla(self) -> bool: - use_mla = (self.is_deepseek_mla and not self.disable_mla - and not envs.VLLM_MLA_DISABLE) + use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE) return use_mla def supported_runner_types(self) -> Set[RunnerType]: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b066479374c5b..cc7c99e50ac4d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,7 +100,6 @@ class EngineArgs: kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None - disable_mla: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -932,9 +931,6 @@ class EngineArgs: type=str, default="auto", help='The worker class to use for distributed execution.') - parser.add_argument('--disable-mla', - action='store_true', - help='Disable MLA for DeepSeek models.') parser.add_argument( "--generation-config", type=nullable_str, @@ -1015,7 +1011,6 @@ class EngineArgs: disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, - disable_mla=self.disable_mla, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, override_generation_config=self.override_generation_config, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 538668927b72c..28ae50e0770ea 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -488,7 +488,7 @@ class DeepseekV2DecoderLayer(nn.Module): # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. layer_idx = int(prefix.split(sep='.')[-1]) - if model_config.should_use_mla: + if model_config.use_mla: attn_cls = DeepseekV2MLAAttention else: attn_cls = DeepseekV2Attention diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d960f53f6d4de..08316ba74aad8 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -52,13 +52,12 @@ class CacheEngine: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] # Get attention backend. - self.attn_backend = get_attn_backend( - self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=model_config.should_use_mla) + self.attn_backend = get_attn_backend(self.head_size, + model_config.dtype, + cache_config.cache_dtype, + self.block_size, + model_config.is_attention_free, + use_mla=model_config.use_mla) # Initialize the cache. self.gpu_cache = self._allocate_kv_cache( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d6c85d9b04983..b6ed3abab4247 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1066,7 +1066,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.kv_cache_dtype, self.block_size, self.model_config.is_attention_free, - use_mla=self.model_config.should_use_mla, + use_mla=self.model_config.use_mla, ) if needs_attn_backend else None if self.attn_backend: self.attn_state = self.attn_backend.get_state_cls()(