[Minor] Remove unnecessary error message

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Zhuohan Li 2025-10-17 11:05:19 -07:00
parent 99722d5f0e
commit a6427280c1
2 changed files with 19 additions and 55 deletions

File 1 of 2: the attention layer module (classes Attention and MLAAttention)

@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes, direct_register_custom_op
+from vllm.utils import direct_register_custom_op

 FP8_DTYPE = current_platform.fp8_dtype()

 logger = init_logger(__name__)
@@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
             )
         ]

-        try:
-            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
-            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to initialize attention q/k/v range constants: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to initialize q/k/v range constants. "
-                "This may be caused by insufficient memory to allocate "
-                "kv cache."
-            ) from e
+        # Initialize q/k/v range constants.
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)

         # for attn backends supporting query quantization
         self.query_quant = None
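A side note on why the handler around these lines was removable (my reading of the diff, not a statement in the commit): each range constant is a single scalar float32 tensor, so an out-of-memory failure at this point is essentially never caused by the constants themselves, and the removed message pointing at the KV cache was misleading. A minimal standalone sketch, with the envs.*_SCALE_CONSTANT values stubbed as plain floats (the real values come from vllm.envs and may differ):

import torch

# Stand-ins for envs.Q_SCALE_CONSTANT / K_SCALE_CONSTANT / V_SCALE_CONSTANT;
# the actual values are read from vllm.envs and may differ.
Q_SCALE_CONSTANT = 200.0
K_SCALE_CONSTANT = 200.0
V_SCALE_CONSTANT = 100.0

# Same pattern as the simplified __init__: three scalar float32 tensors.
q_range = torch.tensor(Q_SCALE_CONSTANT, dtype=torch.float32)
k_range = torch.tensor(K_SCALE_CONSTANT, dtype=torch.float32)
v_range = torch.tensor(V_SCALE_CONSTANT, dtype=torch.float32)

for name, t in (("q_range", q_range), ("k_range", k_range), ("v_range", v_range)):
    # element_size() * nelement() == 4 bytes for a scalar float32 tensor.
    print(name, float(t), t.element_size() * t.nelement(), "bytes")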
@@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         self.use_sparse = use_sparse

         # Initialize q/k/v range constants.
-        try:
-            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
-            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError:
-            # Keep defaults if allocation fails; not critical for init.
-            pass
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)

     def forward(
         self,
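One further inference from the removed MLAAttention branch (not stated in the commit): unless defaults were assigned earlier in __init__, an except block that only does pass keeps no defaults at all; it leaves q_range/k_range/v_range unset, so a later access would fail with AttributeError regardless. A toy illustration, not vLLM code:

import torch

class Holder:
    """Toy stand-in for the old init path; the failure below is injected."""

    def __init__(self, simulate_oom: bool = False) -> None:
        try:
            if simulate_oom:
                # Hypothetical failure for illustration only.
                raise torch.cuda.OutOfMemoryError("simulated allocation failure")
            self.q_range = torch.tensor(200.0, dtype=torch.float32)
        except torch.cuda.OutOfMemoryError:
            # Mirrors the removed handler: nothing is assigned here, so
            # there is no default to fall back to afterwards.
            pass

print(hasattr(Holder(simulate_oom=True), "q_range"))  # prints False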

File 2 of 2: the linear layer module (class UnquantizedLinearMethod)

@@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
 )
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes

 logger = init_logger(__name__)
@@ -211,33 +210,17 @@ class UnquantizedLinearMethod(LinearMethodBase):
         # The weights are not quantized, and they are not sharded.
         # The amount of memory allocated for the weights is
         # sum(output_partition_sizes) * input_size_per_partition.
-        try:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(
-                data=torch.empty(
-                    sum(output_partition_sizes),
-                    input_size_per_partition,
-                    dtype=params_dtype,
-                ),
-                input_dim=1,
-                output_dim=0,
-                weight_loader=weight_loader,
-            )
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to create unquantized linear weights: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to create unquantized linear weights. "
-                "This may be caused by insufficient memory to allocate "
-                "the weight."
-            ) from e
+        weight_loader = extra_weight_attrs.pop("weight_loader")
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
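On the caller-visible effect of dropping the wrapper in create_weights (an inference, assuming a recent PyTorch): torch.cuda.OutOfMemoryError already derives from RuntimeError, so code that catches RuntimeError around weight creation still sees allocation failures; only the extra message and the GiB-usage debug logging disappear. A minimal sketch with made-up shapes:

import torch

# OutOfMemoryError is a RuntimeError subclass in recent PyTorch, so no
# caller-visible exception type is lost by removing the re-raise.
assert issubclass(torch.cuda.OutOfMemoryError, RuntimeError)

def create_weight(out_features: int, in_features: int) -> torch.nn.Parameter:
    # Mirrors the allocation kept by the diff:
    # sum(output_partition_sizes) x input_size_per_partition, uninitialized.
    data = torch.empty(out_features, in_features, dtype=torch.float16)
    return torch.nn.Parameter(data, requires_grad=False)

try:
    weight = create_weight(4096, 4096)  # made-up shapes for illustration
except RuntimeError as e:
    # Still catches torch.cuda.OutOfMemoryError if a GPU allocation fails.
    print(f"weight allocation failed: {e}")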