[Minor] Remove unnecessary error message

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2026-03-16 11:47:09 +08:00 · 2025-10-17 11:05:19 -07:00 · 2025-10-17 11:05:19 -07:00 · a6427280c1
commit a6427280c1
parent 99722d5f0e
2 changed files with 19 additions and 55 deletions
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes, direct_register_custom_op
+from vllm.utils import direct_register_custom_op
 FP8_DTYPE = current_platform.fp8_dtype()
 logger = init_logger(__name__)
@@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
            )
        ]
-        try:
+        # Initialize q/k/v range constants.
        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to initialize attention q/k/v range constants: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to initialize q/k/v range constants. "
-                "This may be caused by insufficient memory to allocate "
-                "kv cache."
-            ) from e
        # for attn backends supporting query quantization
        self.query_quant = None
@@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
        self.use_sparse = use_sparse
        # Initialize q/k/v range constants.
-        try:
        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError:
-            # Keep defaults if allocation fails; not critical for init.
-            pass
    def forward(
        self,
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
 )
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes
 logger = init_logger(__name__)
@@ -211,7 +210,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
        # The weights are not quantized, and they are not sharded.
        # The amount of memory allocated for the weights is
        # sum(output_partition_sizes) * input_size_per_partition.
-        try:
        weight_loader = extra_weight_attrs.pop("weight_loader")
        weight = ModelWeightParameter(
            data=torch.empty(
@@ -223,21 +221,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
            output_dim=0,
            weight_loader=weight_loader,
        )
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to create unquantized linear weights: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to create unquantized linear weights. "
-                "This may be caused by insufficient memory to allocate "
-                "the weight."
-            ) from e
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, extra_weight_attrs)