mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 23:37:56 +08:00
[Fix] Support passing args to logger (#17425)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
This commit is contained in:
parent
39317cf42b
commit
da4e7687b5
@ -278,7 +278,7 @@ class ModelConfig:
|
|||||||
max_model_len: int = None # type: ignore
|
max_model_len: int = None # type: ignore
|
||||||
"""Model context length (prompt and output). If unspecified, will be
|
"""Model context length (prompt and output). If unspecified, will be
|
||||||
automatically derived from the model config.
|
automatically derived from the model config.
|
||||||
|
|
||||||
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
|
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
|
||||||
format. Examples:\n
|
format. Examples:\n
|
||||||
- 1k -> 1000\n
|
- 1k -> 1000\n
|
||||||
@ -518,11 +518,11 @@ class ModelConfig:
|
|||||||
self.hf_text_config.sliding_window)
|
self.hf_text_config.sliding_window)
|
||||||
|
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"{self.hf_text_config.model_type} has interleaved "
|
"%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501
|
||||||
"attention, which is currently not supported by the "
|
self.hf_text_config.model_type,
|
||||||
f"{backend} backend. Disabling sliding window and capping "
|
backend,
|
||||||
"the max length to the sliding window size "
|
sliding_window_len_min,
|
||||||
f"({sliding_window_len_min}).")
|
)
|
||||||
self.disable_sliding_window = True
|
self.disable_sliding_window = True
|
||||||
else:
|
else:
|
||||||
# for a model with interleaved attention,
|
# for a model with interleaved attention,
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from collections.abc import Hashable
|
||||||
from functools import lru_cache, partial
|
from functools import lru_cache, partial
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
from logging.config import dictConfig
|
from logging.config import dictConfig
|
||||||
@ -52,15 +53,15 @@ DEFAULT_LOGGING_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
@lru_cache
|
@lru_cache
|
||||||
def _print_info_once(logger: Logger, msg: str) -> None:
|
def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
|
||||||
# Set the stacklevel to 2 to print the original caller's line info
|
# Set the stacklevel to 2 to print the original caller's line info
|
||||||
logger.info(msg, stacklevel=2)
|
logger.info(msg, *args, stacklevel=2)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
|
@lru_cache
|
||||||
def _print_warning_once(logger: Logger, msg: str) -> None:
|
def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
|
||||||
# Set the stacklevel to 2 to print the original caller's line info
|
# Set the stacklevel to 2 to print the original caller's line info
|
||||||
logger.warning(msg, stacklevel=2)
|
logger.warning(msg, *args, stacklevel=2)
|
||||||
|
|
||||||
|
|
||||||
class _VllmLogger(Logger):
|
class _VllmLogger(Logger):
|
||||||
@ -72,19 +73,19 @@ class _VllmLogger(Logger):
|
|||||||
`intel_extension_for_pytorch.utils._logger`.
|
`intel_extension_for_pytorch.utils._logger`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def info_once(self, msg: str) -> None:
|
def info_once(self, msg: str, *args: Hashable) -> None:
|
||||||
"""
|
"""
|
||||||
As :meth:`info`, but subsequent calls with the same message
|
As :meth:`info`, but subsequent calls with the same message
|
||||||
are silently dropped.
|
are silently dropped.
|
||||||
"""
|
"""
|
||||||
_print_info_once(self, msg)
|
_print_info_once(self, msg, *args)
|
||||||
|
|
||||||
def warning_once(self, msg: str) -> None:
|
def warning_once(self, msg: str, *args: Hashable) -> None:
|
||||||
"""
|
"""
|
||||||
As :meth:`warning`, but subsequent calls with the same message
|
As :meth:`warning`, but subsequent calls with the same message
|
||||||
are silently dropped.
|
are silently dropped.
|
||||||
"""
|
"""
|
||||||
_print_warning_once(self, msg)
|
_print_warning_once(self, msg, *args)
|
||||||
|
|
||||||
|
|
||||||
def _configure_vllm_root_logger() -> None:
|
def _configure_vllm_root_logger() -> None:
|
||||||
|
|||||||
@ -15,6 +15,5 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
|
|||||||
punica_wrapper = punica_wrapper_cls(*args, **kwargs)
|
punica_wrapper = punica_wrapper_cls(*args, **kwargs)
|
||||||
assert punica_wrapper is not None, \
|
assert punica_wrapper is not None, \
|
||||||
"the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
|
"the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
|
||||||
logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] +
|
logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
|
||||||
".")
|
|
||||||
return punica_wrapper
|
return punica_wrapper
|
||||||
|
|||||||
@ -107,9 +107,9 @@ class CustomOp(nn.Module):
|
|||||||
custom_ops = compilation_config.custom_ops
|
custom_ops = compilation_config.custom_ops
|
||||||
if not hasattr(cls, "name"):
|
if not hasattr(cls, "name"):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Custom op {cls.__name__} was not registered, "
|
"Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.", # noqa: E501
|
||||||
f"which means it won't appear in the op registry. "
|
cls.__name__,
|
||||||
f"It will be enabled/disabled based on the global settings.")
|
)
|
||||||
return CustomOp.default_on()
|
return CustomOp.default_on()
|
||||||
|
|
||||||
enabled = f"+{cls.name}" in custom_ops
|
enabled = f"+{cls.name}" in custom_ops
|
||||||
|
|||||||
@ -191,9 +191,9 @@ class GrammarConfig:
|
|||||||
|
|
||||||
if model_with_warn is not None and any_whitespace:
|
if model_with_warn is not None and any_whitespace:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
f"{model_with_warn} model detected, consider setting "
|
"%s model detected, consider setting `disable_any_whitespace` to prevent runaway generation of whitespaces.", # noqa: E501
|
||||||
"`disable_any_whitespace` to prevent runaway generation "
|
model_with_warn,
|
||||||
"of whitespaces.")
|
)
|
||||||
# Validate the schema and raise ValueError here if it is invalid.
|
# Validate the schema and raise ValueError here if it is invalid.
|
||||||
# This is to avoid exceptions in model execution, which will crash
|
# This is to avoid exceptions in model execution, which will crash
|
||||||
# the engine worker process.
|
# the engine worker process.
|
||||||
|
|||||||
@ -130,8 +130,9 @@ class AWQMarlinConfig(QuantizationConfig):
|
|||||||
# Check if the layer is supported by AWQMarlin.
|
# Check if the layer is supported by AWQMarlin.
|
||||||
if not check_marlin_supports_layer(layer, self.group_size):
|
if not check_marlin_supports_layer(layer, self.group_size):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Layer '{prefix}' is not supported by AWQMarlin. "
|
"Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.", # noqa: E501
|
||||||
"Falling back to unoptimized AWQ kernels.")
|
prefix,
|
||||||
|
)
|
||||||
return AWQConfig.from_config(
|
return AWQConfig.from_config(
|
||||||
self.full_config).get_quant_method(layer, prefix)
|
self.full_config).get_quant_method(layer, prefix)
|
||||||
return AWQMarlinLinearMethod(self)
|
return AWQMarlinLinearMethod(self)
|
||||||
|
|||||||
@ -464,7 +464,7 @@ def fastsafetensors_weights_iterator(
|
|||||||
hf_weights_files: List[str],
|
hf_weights_files: List[str],
|
||||||
use_tqdm_on_load: bool,
|
use_tqdm_on_load: bool,
|
||||||
) -> Generator[Tuple[str, torch.Tensor], None, None]:
|
) -> Generator[Tuple[str, torch.Tensor], None, None]:
|
||||||
"""Iterate over the weights in the model safetensor files
|
"""Iterate over the weights in the model safetensor files
|
||||||
using fastsafetensor library."""
|
using fastsafetensor library."""
|
||||||
if torch.distributed.is_initialized():
|
if torch.distributed.is_initialized():
|
||||||
pg = torch.distributed.group.WORLD
|
pg = torch.distributed.group.WORLD
|
||||||
@ -716,10 +716,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
|
|||||||
remapped_name = name.replace(".kv_scale", ".attn.k_scale")
|
remapped_name = name.replace(".kv_scale", ".attn.k_scale")
|
||||||
if remapped_name not in params_dict:
|
if remapped_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Found kv_scale in the checkpoint (e.g. {name}), "
|
"Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.", # noqa: E501
|
||||||
"but not found the expected name in the model "
|
name,
|
||||||
f"(e.g. {remapped_name}). kv_scale is "
|
remapped_name,
|
||||||
"not loaded.")
|
)
|
||||||
return None
|
return None
|
||||||
return remapped_name
|
return remapped_name
|
||||||
|
|
||||||
@ -738,10 +738,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
|
|||||||
remapped_name = name.replace(scale_name, f".attn{scale_name}")
|
remapped_name = name.replace(scale_name, f".attn{scale_name}")
|
||||||
if remapped_name not in params_dict:
|
if remapped_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Found {scale_name} in the checkpoint (e.g. {name}), "
|
"Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501
|
||||||
"but not found the expected name in the model "
|
scale_name,
|
||||||
f"(e.g. {remapped_name}). {scale_name} is "
|
name,
|
||||||
"not loaded.")
|
remapped_name,
|
||||||
|
scale_name,
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
return remapped_name
|
return remapped_name
|
||||||
|
|
||||||
|
|||||||
@ -1111,10 +1111,10 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
".kv_scale", ".attn.kv_scale")
|
".kv_scale", ".attn.kv_scale")
|
||||||
if remapped_kv_scale_name not in params_dict:
|
if remapped_kv_scale_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Found kv scale in the checkpoint (e.g. "
|
"Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
|
||||||
f"{name}), but not found the expected name in "
|
name,
|
||||||
f"the model (e.g. {remapped_kv_scale_name}). "
|
remapped_kv_scale_name,
|
||||||
"kv-scale is not loaded.")
|
)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
name = remapped_kv_scale_name
|
name = remapped_kv_scale_name
|
||||||
|
|||||||
@ -385,11 +385,10 @@ class OlmoeModel(nn.Module):
|
|||||||
".kv_scale", ".attn.kv_scale")
|
".kv_scale", ".attn.kv_scale")
|
||||||
if remapped_kv_scale_name not in params_dict:
|
if remapped_kv_scale_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Found kv scale in the checkpoint "
|
"Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
|
||||||
f"(e.g. {name}), but not found the expected "
|
name,
|
||||||
f"name in the model "
|
remapped_kv_scale_name,
|
||||||
f"(e.g. {remapped_kv_scale_name}). "
|
)
|
||||||
"kv-scale is not loaded.")
|
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
name = remapped_kv_scale_name
|
name = remapped_kv_scale_name
|
||||||
|
|||||||
@ -462,11 +462,10 @@ class Qwen2MoeModel(nn.Module):
|
|||||||
".kv_scale", ".attn.kv_scale")
|
".kv_scale", ".attn.kv_scale")
|
||||||
if remapped_kv_scale_name not in params_dict:
|
if remapped_kv_scale_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Found kv scale in the checkpoint "
|
"Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.", # noqa: E501
|
||||||
f"(e.g. {name}), but not found the expected "
|
name,
|
||||||
f"name in the model "
|
remapped_kv_scale_name,
|
||||||
f"(e.g. {remapped_kv_scale_name}). "
|
)
|
||||||
"kv-scale is not loaded.")
|
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
name = remapped_kv_scale_name
|
name = remapped_kv_scale_name
|
||||||
|
|||||||
@ -459,11 +459,10 @@ class Qwen3MoeModel(nn.Module):
|
|||||||
".kv_scale", ".attn.kv_scale")
|
".kv_scale", ".attn.kv_scale")
|
||||||
if remapped_kv_scale_name not in params_dict:
|
if remapped_kv_scale_name not in params_dict:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Found kv scale in the checkpoint "
|
"Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
|
||||||
f"(e.g. {name}), but not found the expected "
|
name,
|
||||||
f"name in the model "
|
remapped_kv_scale_name,
|
||||||
f"(e.g. {remapped_kv_scale_name}). "
|
)
|
||||||
"kv-scale is not loaded.")
|
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
name = remapped_kv_scale_name
|
name = remapped_kv_scale_name
|
||||||
|
|||||||
@ -215,17 +215,14 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
elif total_len > seq_len and not envs.VLLM_USE_V1:
|
elif total_len > seq_len and not envs.VLLM_USE_V1:
|
||||||
# `max_num_batched_tokens` is defined by `SchedulerConfig`
|
# `max_num_batched_tokens` is defined by `SchedulerConfig`
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"The encoder sequence length used for profiling ("
|
"The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) " # noqa: E501
|
||||||
f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
|
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). " # noqa: E501
|
||||||
" is too short "
|
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. " # noqa: E501
|
||||||
"to hold the multi-modal embeddings in the worst case "
|
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.", # noqa: E501
|
||||||
f"({total_len} tokens in total, out of which "
|
seq_len,
|
||||||
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
|
total_len,
|
||||||
"multi-modal embeddings). This may cause certain "
|
str(self._get_mm_num_tokens(mm_inputs)),
|
||||||
"multi-modal inputs to fail during inference, even when "
|
)
|
||||||
"the input text is short. To avoid this, you should "
|
|
||||||
"increase `max_model_len`, reduce `max_num_seqs`, "
|
|
||||||
"and/or reduce `mm_counts`.")
|
|
||||||
|
|
||||||
return DummyEncoderData(encoder_prompt_token_ids)
|
return DummyEncoderData(encoder_prompt_token_ids)
|
||||||
|
|
||||||
@ -243,17 +240,14 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
if total_len > seq_len and not envs.VLLM_USE_V1:
|
if total_len > seq_len and not envs.VLLM_USE_V1:
|
||||||
# `max_num_batched_tokens` is defined by `SchedulerConfig`
|
# `max_num_batched_tokens` is defined by `SchedulerConfig`
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"The sequence length used for profiling ("
|
"The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) " # noqa: E501
|
||||||
f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
|
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). " # noqa: E501
|
||||||
"is too short "
|
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. " # noqa: E501
|
||||||
"to hold the multi-modal embeddings in the worst case "
|
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.", # noqa: E501
|
||||||
f"({total_len} tokens in total, out of which "
|
seq_len,
|
||||||
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
|
total_len,
|
||||||
"multi-modal embeddings). This may cause certain "
|
str(self._get_mm_num_tokens(mm_inputs)),
|
||||||
"multi-modal inputs to fail during inference, even when "
|
)
|
||||||
"the input text is short. To avoid this, you should "
|
|
||||||
"increase `max_model_len`, reduce `max_num_seqs`, "
|
|
||||||
"and/or reduce `mm_counts`.")
|
|
||||||
|
|
||||||
if total_len < seq_len:
|
if total_len < seq_len:
|
||||||
prompt_token_ids.extend([0] * (seq_len - total_len))
|
prompt_token_ids.extend([0] * (seq_len - total_len))
|
||||||
|
|||||||
@ -100,7 +100,7 @@ class MultiModalRegistry:
|
|||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
"""
|
"""
|
||||||
Get the maximum number of tokens per data item from each modality based
|
Get the maximum number of tokens per data item from each modality based
|
||||||
on underlying model configuration.
|
on underlying model configuration.
|
||||||
"""
|
"""
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
@ -126,11 +126,11 @@ class MultiModalRegistry:
|
|||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
"""
|
"""
|
||||||
Get the maximum number of tokens per data item from each modality based
|
Get the maximum number of tokens per data item from each modality based
|
||||||
on underlying model configuration, excluding modalities that user
|
on underlying model configuration, excluding modalities that user
|
||||||
explicitly disabled via `limit_mm_per_prompt`.
|
explicitly disabled via `limit_mm_per_prompt`.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
This is currently directly used only in V1 for profiling the memory
|
This is currently directly used only in V1 for profiling the memory
|
||||||
usage of a model.
|
usage of a model.
|
||||||
"""
|
"""
|
||||||
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
||||||
@ -316,7 +316,9 @@ class MultiModalRegistry:
|
|||||||
token_ids = dummy_data.prompt_token_ids
|
token_ids = dummy_data.prompt_token_ids
|
||||||
if len(token_ids) < seq_len:
|
if len(token_ids) < seq_len:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Expected at least {seq_len} dummy encoder tokens for "
|
"Expected at least %d dummy encoder tokens for profiling, but found %d tokens instead.", # noqa: E501
|
||||||
f"profiling, but found {len(token_ids)} tokens instead.")
|
seq_len,
|
||||||
|
len(token_ids),
|
||||||
|
)
|
||||||
|
|
||||||
return dummy_data
|
return dummy_data
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user