mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 22:25:32 +08:00
[Log] Optimize startup log (#28948)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
parent
b7f1f490a6
commit
a42ab317ac
@ -872,8 +872,10 @@ def get_moe_configs(
|
|||||||
for config_file_path in config_file_paths:
|
for config_file_path in config_file_paths:
|
||||||
if os.path.exists(config_file_path):
|
if os.path.exists(config_file_path):
|
||||||
with open(config_file_path) as f:
|
with open(config_file_path) as f:
|
||||||
logger.info(
|
logger.info_once(
|
||||||
"Using configuration from %s for MoE layer.", config_file_path
|
"Using configuration from %s for MoE layer.",
|
||||||
|
config_file_path,
|
||||||
|
scope="global",
|
||||||
)
|
)
|
||||||
# If a configuration has been found, return it
|
# If a configuration has been found, return it
|
||||||
tuned_config = json.load(f)
|
tuned_config = json.load(f)
|
||||||
|
|||||||
@ -162,9 +162,11 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
|
|||||||
# deepGEMM on supported platforms with block-quantized weights
|
# deepGEMM on supported platforms with block-quantized weights
|
||||||
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
|
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
|
||||||
if not has_deep_gemm():
|
if not has_deep_gemm():
|
||||||
logger.warning_once("DeepGEMM backend requested but not available.")
|
logger.warning_once(
|
||||||
|
"DeepGEMM backend requested but not available.", scope="local"
|
||||||
|
)
|
||||||
elif is_deep_gemm_supported():
|
elif is_deep_gemm_supported():
|
||||||
logger.info_once("Using DeepGEMM backend for FP8 MoE")
|
logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
|
||||||
return Fp8MoeBackend.DEEPGEMM
|
return Fp8MoeBackend.DEEPGEMM
|
||||||
|
|
||||||
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
|
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
|
||||||
@ -173,7 +175,9 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
|
|||||||
and current_platform.is_device_capability(100)
|
and current_platform.is_device_capability(100)
|
||||||
and block_quant
|
and block_quant
|
||||||
):
|
):
|
||||||
logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
|
logger.info_once(
|
||||||
|
"Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
|
||||||
|
)
|
||||||
return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
|
return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
|
||||||
|
|
||||||
# default to Triton
|
# default to Triton
|
||||||
|
|||||||
@ -139,18 +139,19 @@ class TorchProfilerWrapper(WorkerProfiler):
|
|||||||
|
|
||||||
self.local_rank = local_rank
|
self.local_rank = local_rank
|
||||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||||
logger.info(
|
if local_rank in (None, 0):
|
||||||
"Torch profiling enabled. Traces will be saved to: %s",
|
logger.info(
|
||||||
torch_profiler_trace_dir,
|
"Torch profiling enabled. Traces will be saved to: %s",
|
||||||
)
|
torch_profiler_trace_dir,
|
||||||
logger.debug(
|
)
|
||||||
"Profiler config: record_shapes=%s,"
|
logger.debug(
|
||||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
"Profiler config: record_shapes=%s,"
|
||||||
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||||
)
|
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||||
|
)
|
||||||
self.profiler = torch.profiler.profile(
|
self.profiler = torch.profiler.profile(
|
||||||
activities=[
|
activities=[
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
torch.profiler.ProfilerActivity.CPU,
|
||||||
|
|||||||
@ -1236,10 +1236,11 @@ def _report_kv_cache_config(
|
|||||||
max_concurrency = get_max_concurrency_for_kv_cache_config(
|
max_concurrency = get_max_concurrency_for_kv_cache_config(
|
||||||
vllm_config, kv_cache_config
|
vllm_config, kv_cache_config
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info_once(
|
||||||
"Maximum concurrency for %s tokens per request: %.2fx",
|
"Maximum concurrency for %s tokens per request: %.2fx",
|
||||||
max_model_len_str,
|
max_model_len_str,
|
||||||
max_concurrency,
|
max_concurrency,
|
||||||
|
scope="local",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user