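Address review comments: remove the `disable_mla` option (and `--disable-mla` flag), use `use_mla` instead of `should_use_mla`, and rename `**kwargs` to `**extra_impl_args`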

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
Lucas Wilkinson 2025-01-30 14:09:46 +00:00
parent 2b140debbb
commit 7241acbd64
6 changed files with 13 additions and 23 deletions

View File

@@ -44,7 +44,7 @@ class Attention(nn.Module):
use_mla: bool = False, use_mla: bool = False,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
**kwargs, **extra_impl_args,
) -> None: ) -> None:
super().__init__() super().__init__()
if per_layer_sliding_window is not None: if per_layer_sliding_window is not None:
@@ -114,7 +114,7 @@ class Attention(nn.Module):
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
alibi_slopes, sliding_window, kv_cache_dtype, alibi_slopes, sliding_window, kv_cache_dtype,
blocksparse_params, logits_soft_cap, attn_type, blocksparse_params, logits_soft_cap, attn_type,
**kwargs) **extra_impl_args)
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.num_kv_heads = num_kv_heads self.num_kv_heads = num_kv_heads

View File

@@ -165,7 +165,6 @@ class ModelConfig:
`logits_processors` extra completion argument. Defaults to None, `logits_processors` extra completion argument. Defaults to None,
which allows no processors. which allows no processors.
generation_config: Configuration parameter file for generation. generation_config: Configuration parameter file for generation.
disable_mla: Whether to disable MLA for DeepSeek models.
override_generation_config: Override the generation config with the override_generation_config: Override the generation config with the
given config. given config.
""" """
@@ -227,7 +226,6 @@ class ModelConfig:
override_pooler_config: Optional["PoolerConfig"] = None, override_pooler_config: Optional["PoolerConfig"] = None,
logits_processor_pattern: Optional[str] = None, logits_processor_pattern: Optional[str] = None,
generation_config: Optional[str] = None, generation_config: Optional[str] = None,
disable_mla: bool = False,
enable_sleep_mode: bool = False, enable_sleep_mode: bool = False,
override_generation_config: Optional[Dict[str, Any]] = None, override_generation_config: Optional[Dict[str, Any]] = None,
) -> None: ) -> None:
@@ -278,7 +276,6 @@ class ModelConfig:
self.max_logprobs = max_logprobs self.max_logprobs = max_logprobs
self.disable_sliding_window = disable_sliding_window self.disable_sliding_window = disable_sliding_window
self.skip_tokenizer_init = skip_tokenizer_init self.skip_tokenizer_init = skip_tokenizer_init
self.disable_mla = disable_mla
self.enable_sleep_mode = enable_sleep_mode self.enable_sleep_mode = enable_sleep_mode
from vllm.platforms import current_platform from vllm.platforms import current_platform
@@ -748,7 +745,7 @@ class ModelConfig:
def get_head_size(self) -> int: def get_head_size(self) -> int:
# TODO remove hard code # TODO remove hard code
if self.is_deepseek_mla: if self.is_deepseek_mla:
if self.should_use_mla: if self.use_mla:
return self.hf_text_config.kv_lora_rank return self.hf_text_config.kv_lora_rank
else: else:
qk_rope_head_dim = getattr(self.hf_text_config, qk_rope_head_dim = getattr(self.hf_text_config,
@@ -815,7 +812,7 @@ class ModelConfig:
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
"""Returns the number of KV heads per GPU.""" """Returns the number of KV heads per GPU."""
if self.should_use_mla: if self.use_mla:
# When using MLA during decode it becomes MQA # When using MLA during decode it becomes MQA
return 1 return 1
@@ -971,8 +968,7 @@ class ModelConfig:
@property @property
def use_mla(self) -> bool: def use_mla(self) -> bool:
use_mla = (self.is_deepseek_mla and not self.disable_mla use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
and not envs.VLLM_MLA_DISABLE)
return use_mla return use_mla
def supported_runner_types(self) -> Set[RunnerType]: def supported_runner_types(self) -> Set[RunnerType]:

View File

@@ -100,7 +100,6 @@ class EngineArgs:
kv_cache_dtype: str = 'auto' kv_cache_dtype: str = 'auto'
seed: int = 0 seed: int = 0
max_model_len: Optional[int] = None max_model_len: Optional[int] = None
disable_mla: bool = False
# Note: Specifying a custom executor backend by passing a class # Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without # is intended for expert use only. The API may change without
# notice. # notice.
@@ -932,9 +931,6 @@ class EngineArgs:
type=str, type=str,
default="auto", default="auto",
help='The worker class to use for distributed execution.') help='The worker class to use for distributed execution.')
parser.add_argument('--disable-mla',
action='store_true',
help='Disable MLA for DeepSeek models.')
parser.add_argument( parser.add_argument(
"--generation-config", "--generation-config",
type=nullable_str, type=nullable_str,
@@ -1015,7 +1011,6 @@ class EngineArgs:
disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
override_neuron_config=self.override_neuron_config, override_neuron_config=self.override_neuron_config,
override_pooler_config=self.override_pooler_config, override_pooler_config=self.override_pooler_config,
disable_mla=self.disable_mla,
logits_processor_pattern=self.logits_processor_pattern, logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config, generation_config=self.generation_config,
override_generation_config=self.override_generation_config, override_generation_config=self.override_generation_config,

View File

@@ -488,7 +488,7 @@ class DeepseekV2DecoderLayer(nn.Module):
# DecoderLayers are created with `make_layers` which passes the prefix # DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index. # with the layer's index.
layer_idx = int(prefix.split(sep='.')[-1]) layer_idx = int(prefix.split(sep='.')[-1])
if model_config.should_use_mla: if model_config.use_mla:
attn_cls = DeepseekV2MLAAttention attn_cls = DeepseekV2MLAAttention
else: else:
attn_cls = DeepseekV2Attention attn_cls = DeepseekV2Attention

View File

@@ -52,13 +52,12 @@ class CacheEngine:
self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
# Get attention backend. # Get attention backend.
self.attn_backend = get_attn_backend( self.attn_backend = get_attn_backend(self.head_size,
self.head_size, model_config.dtype,
model_config.dtype, cache_config.cache_dtype,
cache_config.cache_dtype, self.block_size,
self.block_size, model_config.is_attention_free,
model_config.is_attention_free, use_mla=model_config.use_mla)
use_mla=model_config.should_use_mla)
# Initialize the cache. # Initialize the cache.
self.gpu_cache = self._allocate_kv_cache( self.gpu_cache = self._allocate_kv_cache(

View File

@@ -1066,7 +1066,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.kv_cache_dtype, self.kv_cache_dtype,
self.block_size, self.block_size,
self.model_config.is_attention_free, self.model_config.is_attention_free,
use_mla=self.model_config.should_use_mla, use_mla=self.model_config.use_mla,
) if needs_attn_backend else None ) if needs_attn_backend else None
if self.attn_backend: if self.attn_backend:
self.attn_state = self.attn_backend.get_state_cls()( self.attn_state = self.attn_backend.get_state_cls()(