mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-25 10:53:41 +08:00
review comments
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
parent
aa19f297d2
commit
534cd0006d
@ -748,8 +748,6 @@ class ModelConfig:
|
||||
def get_head_size(self) -> int:
|
||||
# TODO remove hard code
|
||||
if self.is_deepseek_mla:
|
||||
# FlashAttention supports only head_size 32, 64, 128, 256,
|
||||
# we need to pad head_size 192 to 256
|
||||
if self.should_use_mla:
|
||||
return self.hf_text_config.kv_lora_rank
|
||||
else:
|
||||
@ -974,7 +972,7 @@ class ModelConfig:
|
||||
@property
|
||||
def should_use_mla(self) -> bool:
|
||||
use_mla = (self.is_deepseek_mla and not self.disable_mla
|
||||
and not envs.VLLM_DISABLE_MLA)
|
||||
and not envs.VLLM_MLA_DISABLE)
|
||||
return use_mla
|
||||
|
||||
def supported_runner_types(self) -> Set[RunnerType]:
|
||||
|
||||
@ -77,6 +77,7 @@ if TYPE_CHECKING:
|
||||
V_SCALE_CONSTANT: int = 100
|
||||
VLLM_SERVER_DEV_MODE: bool = False
|
||||
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
|
||||
VLLM_MLA_DISABLE: bool = False
|
||||
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
|
||||
|
||||
|
||||
@ -302,10 +303,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_FLASHINFER_FORCE_TENSOR_CORES":
|
||||
lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),
|
||||
|
||||
# If set, vLLM will disable the MLA attention optimizations.
|
||||
"VLLM_DISABLE_MLA":
|
||||
lambda: bool(int(os.getenv("VLLM_DISABLE_MLA", "0"))),
|
||||
|
||||
# Pipeline stage partition strategy
|
||||
"VLLM_PP_LAYER_PARTITION":
|
||||
lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
||||
@ -512,6 +509,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
|
||||
lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
|
||||
|
||||
# If set, vLLM will disable the MLA attention optimizations.
|
||||
"VLLM_MLA_DISABLE":
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
|
||||
|
||||
# Flag that can control whether or not we perform matrix-absorption for MLA
|
||||
# decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
|
||||
# matrices reduces the runtime FLOPs needed to compute MLA but requires
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user