mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:47:09 +08:00
[Minor] Remove unnecessary error message
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
This commit is contained in:
parent
99722d5f0e
commit
a6427280c1
@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
|||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||||
from vllm.model_executor.models.vision import get_vit_attn_backend
|
from vllm.model_executor.models.vision import get_vit_attn_backend
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import GiB_bytes, direct_register_custom_op
|
from vllm.utils import direct_register_custom_op
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
# Initialize q/k/v range constants.
|
||||||
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
|
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
|
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
|
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
except torch.cuda.OutOfMemoryError as e:
|
|
||||||
logger.error("Failed to initialize attention q/k/v range constants: %s", e)
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
logger.debug("CUDA device: %s", torch.cuda.current_device())
|
|
||||||
logger.debug(
|
|
||||||
"Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
|
|
||||||
)
|
|
||||||
logger.debug(
|
|
||||||
"Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
|
|
||||||
)
|
|
||||||
raise RuntimeError(
|
|
||||||
"Failed to initialize q/k/v range constants. "
|
|
||||||
"This may be caused by insufficient memory to allocate "
|
|
||||||
"kv cache."
|
|
||||||
) from e
|
|
||||||
|
|
||||||
# for attn backends supporting query quantization
|
# for attn backends supporting query quantization
|
||||||
self.query_quant = None
|
self.query_quant = None
|
||||||
@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
|
|||||||
self.use_sparse = use_sparse
|
self.use_sparse = use_sparse
|
||||||
|
|
||||||
# Initialize q/k/v range constants.
|
# Initialize q/k/v range constants.
|
||||||
try:
|
|
||||||
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
|
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
|
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
|
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
|
||||||
except torch.cuda.OutOfMemoryError:
|
|
||||||
# Keep defaults if allocation fails; not critical for init.
|
|
||||||
pass
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
|
|||||||
)
|
)
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import GiB_bytes
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -211,7 +210,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
|
|||||||
# The weights are not quantized, and they are not sharded.
|
# The weights are not quantized, and they are not sharded.
|
||||||
# The amount of memory allocated for the weights is
|
# The amount of memory allocated for the weights is
|
||||||
# sum(output_partition_sizes) * input_size_per_partition.
|
# sum(output_partition_sizes) * input_size_per_partition.
|
||||||
try:
|
|
||||||
weight_loader = extra_weight_attrs.pop("weight_loader")
|
weight_loader = extra_weight_attrs.pop("weight_loader")
|
||||||
weight = ModelWeightParameter(
|
weight = ModelWeightParameter(
|
||||||
data=torch.empty(
|
data=torch.empty(
|
||||||
@ -223,21 +221,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
|
|||||||
output_dim=0,
|
output_dim=0,
|
||||||
weight_loader=weight_loader,
|
weight_loader=weight_loader,
|
||||||
)
|
)
|
||||||
except torch.cuda.OutOfMemoryError as e:
|
|
||||||
logger.error("Failed to create unquantized linear weights: %s", e)
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
logger.debug("CUDA device: %s", torch.cuda.current_device())
|
|
||||||
logger.debug(
|
|
||||||
"Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
|
|
||||||
)
|
|
||||||
logger.debug(
|
|
||||||
"Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
|
|
||||||
)
|
|
||||||
raise RuntimeError(
|
|
||||||
"Failed to create unquantized linear weights. "
|
|
||||||
"This may be caused by insufficient memory to allocate "
|
|
||||||
"the weight."
|
|
||||||
) from e
|
|
||||||
|
|
||||||
layer.register_parameter("weight", weight)
|
layer.register_parameter("weight", weight)
|
||||||
set_weight_attrs(weight, extra_weight_attrs)
|
set_weight_attrs(weight, extra_weight_attrs)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user