mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-23 21:23:31 +08:00
review comments
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
parent
2b140debbb
commit
7241acbd64
@ -44,7 +44,7 @@ class Attention(nn.Module):
|
|||||||
use_mla: bool = False,
|
use_mla: bool = False,
|
||||||
prefix: str = "",
|
prefix: str = "",
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
**kwargs,
|
**extra_impl_args,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if per_layer_sliding_window is not None:
|
if per_layer_sliding_window is not None:
|
||||||
@ -114,7 +114,7 @@ class Attention(nn.Module):
|
|||||||
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
|
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
|
||||||
alibi_slopes, sliding_window, kv_cache_dtype,
|
alibi_slopes, sliding_window, kv_cache_dtype,
|
||||||
blocksparse_params, logits_soft_cap, attn_type,
|
blocksparse_params, logits_soft_cap, attn_type,
|
||||||
**kwargs)
|
**extra_impl_args)
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
self.num_kv_heads = num_kv_heads
|
self.num_kv_heads = num_kv_heads
|
||||||
|
|||||||
@ -165,7 +165,6 @@ class ModelConfig:
|
|||||||
`logits_processors` extra completion argument. Defaults to None,
|
`logits_processors` extra completion argument. Defaults to None,
|
||||||
which allows no processors.
|
which allows no processors.
|
||||||
generation_config: Configuration parameter file for generation.
|
generation_config: Configuration parameter file for generation.
|
||||||
disable_mla: Whether to disable MLA for DeepSeek models.
|
|
||||||
override_generation_config: Override the generation config with the
|
override_generation_config: Override the generation config with the
|
||||||
given config.
|
given config.
|
||||||
"""
|
"""
|
||||||
@ -227,7 +226,6 @@ class ModelConfig:
|
|||||||
override_pooler_config: Optional["PoolerConfig"] = None,
|
override_pooler_config: Optional["PoolerConfig"] = None,
|
||||||
logits_processor_pattern: Optional[str] = None,
|
logits_processor_pattern: Optional[str] = None,
|
||||||
generation_config: Optional[str] = None,
|
generation_config: Optional[str] = None,
|
||||||
disable_mla: bool = False,
|
|
||||||
enable_sleep_mode: bool = False,
|
enable_sleep_mode: bool = False,
|
||||||
override_generation_config: Optional[Dict[str, Any]] = None,
|
override_generation_config: Optional[Dict[str, Any]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -278,7 +276,6 @@ class ModelConfig:
|
|||||||
self.max_logprobs = max_logprobs
|
self.max_logprobs = max_logprobs
|
||||||
self.disable_sliding_window = disable_sliding_window
|
self.disable_sliding_window = disable_sliding_window
|
||||||
self.skip_tokenizer_init = skip_tokenizer_init
|
self.skip_tokenizer_init = skip_tokenizer_init
|
||||||
self.disable_mla = disable_mla
|
|
||||||
self.enable_sleep_mode = enable_sleep_mode
|
self.enable_sleep_mode = enable_sleep_mode
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@ -748,7 +745,7 @@ class ModelConfig:
|
|||||||
def get_head_size(self) -> int:
|
def get_head_size(self) -> int:
|
||||||
# TODO remove hard code
|
# TODO remove hard code
|
||||||
if self.is_deepseek_mla:
|
if self.is_deepseek_mla:
|
||||||
if self.should_use_mla:
|
if self.use_mla:
|
||||||
return self.hf_text_config.kv_lora_rank
|
return self.hf_text_config.kv_lora_rank
|
||||||
else:
|
else:
|
||||||
qk_rope_head_dim = getattr(self.hf_text_config,
|
qk_rope_head_dim = getattr(self.hf_text_config,
|
||||||
@ -815,7 +812,7 @@ class ModelConfig:
|
|||||||
|
|
||||||
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
|
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
|
||||||
"""Returns the number of KV heads per GPU."""
|
"""Returns the number of KV heads per GPU."""
|
||||||
if self.should_use_mla:
|
if self.use_mla:
|
||||||
# When using MLA during decode it becomes MQA
|
# When using MLA during decode it becomes MQA
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
@ -971,8 +968,7 @@ class ModelConfig:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def use_mla(self) -> bool:
|
def use_mla(self) -> bool:
|
||||||
use_mla = (self.is_deepseek_mla and not self.disable_mla
|
use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
|
||||||
and not envs.VLLM_MLA_DISABLE)
|
|
||||||
return use_mla
|
return use_mla
|
||||||
|
|
||||||
def supported_runner_types(self) -> Set[RunnerType]:
|
def supported_runner_types(self) -> Set[RunnerType]:
|
||||||
|
|||||||
@ -100,7 +100,6 @@ class EngineArgs:
|
|||||||
kv_cache_dtype: str = 'auto'
|
kv_cache_dtype: str = 'auto'
|
||||||
seed: int = 0
|
seed: int = 0
|
||||||
max_model_len: Optional[int] = None
|
max_model_len: Optional[int] = None
|
||||||
disable_mla: bool = False
|
|
||||||
# Note: Specifying a custom executor backend by passing a class
|
# Note: Specifying a custom executor backend by passing a class
|
||||||
# is intended for expert use only. The API may change without
|
# is intended for expert use only. The API may change without
|
||||||
# notice.
|
# notice.
|
||||||
@ -932,9 +931,6 @@ class EngineArgs:
|
|||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
help='The worker class to use for distributed execution.')
|
help='The worker class to use for distributed execution.')
|
||||||
parser.add_argument('--disable-mla',
|
|
||||||
action='store_true',
|
|
||||||
help='Disable MLA for DeepSeek models.')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--generation-config",
|
"--generation-config",
|
||||||
type=nullable_str,
|
type=nullable_str,
|
||||||
@ -1015,7 +1011,6 @@ class EngineArgs:
|
|||||||
disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
|
disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
|
||||||
override_neuron_config=self.override_neuron_config,
|
override_neuron_config=self.override_neuron_config,
|
||||||
override_pooler_config=self.override_pooler_config,
|
override_pooler_config=self.override_pooler_config,
|
||||||
disable_mla=self.disable_mla,
|
|
||||||
logits_processor_pattern=self.logits_processor_pattern,
|
logits_processor_pattern=self.logits_processor_pattern,
|
||||||
generation_config=self.generation_config,
|
generation_config=self.generation_config,
|
||||||
override_generation_config=self.override_generation_config,
|
override_generation_config=self.override_generation_config,
|
||||||
|
|||||||
@ -488,7 +488,7 @@ class DeepseekV2DecoderLayer(nn.Module):
|
|||||||
# DecoderLayers are created with `make_layers` which passes the prefix
|
# DecoderLayers are created with `make_layers` which passes the prefix
|
||||||
# with the layer's index.
|
# with the layer's index.
|
||||||
layer_idx = int(prefix.split(sep='.')[-1])
|
layer_idx = int(prefix.split(sep='.')[-1])
|
||||||
if model_config.should_use_mla:
|
if model_config.use_mla:
|
||||||
attn_cls = DeepseekV2MLAAttention
|
attn_cls = DeepseekV2MLAAttention
|
||||||
else:
|
else:
|
||||||
attn_cls = DeepseekV2Attention
|
attn_cls = DeepseekV2Attention
|
||||||
|
|||||||
@ -52,13 +52,12 @@ class CacheEngine:
|
|||||||
self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
|
self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
|
||||||
|
|
||||||
# Get attention backend.
|
# Get attention backend.
|
||||||
self.attn_backend = get_attn_backend(
|
self.attn_backend = get_attn_backend(self.head_size,
|
||||||
self.head_size,
|
model_config.dtype,
|
||||||
model_config.dtype,
|
cache_config.cache_dtype,
|
||||||
cache_config.cache_dtype,
|
self.block_size,
|
||||||
self.block_size,
|
model_config.is_attention_free,
|
||||||
model_config.is_attention_free,
|
use_mla=model_config.use_mla)
|
||||||
use_mla=model_config.should_use_mla)
|
|
||||||
|
|
||||||
# Initialize the cache.
|
# Initialize the cache.
|
||||||
self.gpu_cache = self._allocate_kv_cache(
|
self.gpu_cache = self._allocate_kv_cache(
|
||||||
|
|||||||
@ -1066,7 +1066,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
|
|||||||
self.kv_cache_dtype,
|
self.kv_cache_dtype,
|
||||||
self.block_size,
|
self.block_size,
|
||||||
self.model_config.is_attention_free,
|
self.model_config.is_attention_free,
|
||||||
use_mla=self.model_config.should_use_mla,
|
use_mla=self.model_config.use_mla,
|
||||||
) if needs_attn_backend else None
|
) if needs_attn_backend else None
|
||||||
if self.attn_backend:
|
if self.attn_backend:
|
||||||
self.attn_state = self.attn_backend.get_state_cls()(
|
self.attn_state = self.attn_backend.get_state_cls()(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user