Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Doc]: fixing typos in diverse files (#29492)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
parent bab438ff3e
commit 66d3d5422c
@@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
         "for headers to be passed with each request. These headers override "
         "per backend constants and values set via environment variable, and "
-        "will be overriden by other arguments (such as request ids).",
+        "will be overridden by other arguments (such as request ids).",
     )
     parser.add_argument(
         "--max-concurrency",
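The precedence described in this help string (per-backend constants and environment-variable headers are overridden by --header values, which are in turn overridden by request-specific headers such as request ids) amounts to a layered dict merge. The sketch below only illustrates that ordering; the variable and environment-variable names are hypothetical, not taken from the vLLM benchmark code.

# Illustrative sketch of the header precedence described above.
# EXTRA_HEADERS and all variable names here are hypothetical.
import os


def merge_headers(cli_headers: dict[str, str], request_headers: dict[str, str]) -> dict[str, str]:
    backend_constants = {"User-Agent": "bench-client"}  # lowest precedence
    env_headers = {}
    if os.getenv("EXTRA_HEADERS"):  # e.g. "k1=v1,k2=v2"
        pairs = os.environ["EXTRA_HEADERS"].split(",")
        env_headers = dict(kv.split("=", 1) for kv in pairs)
    # Later dicts win: --header values override constants and env headers,
    # and per-request headers (such as request ids) override the CLI values.
    return {**backend_constants, **env_headers, **cli_headers, **request_headers}


print(merge_headers({"x-additional-info": "0.3.3"}, {"x-request-id": "abc123"}))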
@@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--percentile-metrics",
         type=str,
         default=None,
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'If not specified, defaults to "ttft,tpot,itl" for generative models '
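A minimal sketch of how a comma-separated --percentile-metrics value can be split and validated against the allowed names listed in the help text; this is an illustration under assumed behaviour, not the actual vLLM argument handling.

# Hypothetical validation of a comma-separated --percentile-metrics value.
import argparse

ALLOWED_METRICS = {"ttft", "tpot", "itl", "e2el"}

parser = argparse.ArgumentParser()
parser.add_argument("--percentile-metrics", type=str, default=None)
args = parser.parse_args(["--percentile-metrics", "ttft,tpot,itl"])

# Fall back to the documented default for generative models when unset.
selected = (args.percentile_metrics or "ttft,tpot,itl").split(",")
unknown = set(selected) - ALLOWED_METRICS
if unknown:
    raise ValueError(f"Unknown percentile metrics: {sorted(unknown)}")
print(selected)  # ['ttft', 'tpot', 'itl']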
@@ -238,9 +238,9 @@ class ParallelConfig:
     cp_kv_cache_interleave_size: int = 1
     """Interleave size of kv_cache storage while using DCP or PCP.
     For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
-    and `total_cp_world_size = pcp_world_size * dcp_world_szie`.
+    and `total_cp_world_size = pcp_world_size * dcp_world_size`.
     store interleave_size tokens on total_cp_rank i,
-    then store next interleave_size tokens on taotal_cp_rank i+1.
+    then store next interleave_size tokens on total_cp_rank i+1.
     Interleave_size=1: token-level alignment, where token `i` is stored on
     total_cp_rank `i % total_cp_world_size`.
     Interleave_size=block_size: block-level alignment, where tokens are
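The corrected docstring describes a chunked round-robin placement: tokens go to context-parallel ranks in groups of interleave_size. The helper below is a sketch of that mapping for intuition only; it is not code from vLLM.

# Sketch of the interleaved KV-cache placement described in the docstring:
# tokens are assigned to total_cp_ranks in chunks of interleave_size.
def total_cp_rank_for_token(token_idx: int, interleave_size: int, total_cp_world_size: int) -> int:
    return (token_idx // interleave_size) % total_cp_world_size


total_cp_world_size = 4  # pcp_world_size * dcp_world_size
# interleave_size=1: token-level alignment, token i lives on rank i % total_cp_world_size.
print([total_cp_rank_for_token(i, 1, total_cp_world_size) for i in range(8)])  # [0, 1, 2, 3, 0, 1, 2, 3]
# interleave_size=block_size (2 here): whole blocks of tokens stay on one rank.
print([total_cp_rank_for_token(i, 2, total_cp_world_size) for i in range(8)])  # [0, 0, 1, 1, 2, 2, 3, 3]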
@@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         vocab_size: int,
     ):
         # NOTE We have remove lora extra vocab support for now. So we set
-        # extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed.
+        # extra_vocab_size always to 0, and extra_vocab_size will be removed.

         extra_vocab_size = 0
         (
@@ -181,7 +181,7 @@ def apply_top_k_top_p(
     after thresholding the logit using this cut-off, the remaining elements
     shall constitute the top-p set.

-    Note: in the case of tie (i.e. multipple cut-off elements present in the
+    Note: in the case of tie (i.e. multiple cut-off elements present in the
     logit), all tie elements are included in the top-p set. In other words,
     this function does not break ties. Instead, these tie tokens have equal
     chance of being chosen during final sampling, so we can consider the tie
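The tie behaviour this docstring describes (every element equal to the cut-off value remains in the top-p set) can be reproduced with a >= comparison against the cut-off probability. The snippet below is a simplified 1-D illustration of that semantics, assuming a softmax-then-cumulative-sum formulation; it is not vLLM's apply_top_k_top_p implementation.

# Simplified top-p mask that keeps ties, mirroring the docstring's semantics.
# Illustrative sketch only, not the vLLM kernel; handles a single 1-D logit vector.
import torch


def top_p_mask_with_ties(logits: torch.Tensor, p: float) -> torch.Tensor:
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, _ = torch.sort(probs, descending=True)
    cumsum = torch.cumsum(sorted_probs, dim=-1)
    # Cut-off element: first position where the cumulative mass reaches p.
    cutoff_idx = torch.searchsorted(cumsum, torch.tensor([p]))
    cutoff_idx = torch.clamp(cutoff_idx, max=probs.shape[-1] - 1)
    cutoff_prob = sorted_probs[cutoff_idx]
    # Every token whose probability ties with the cut-off element is kept.
    return probs >= cutoff_prob


logits = torch.tensor([2.0, 2.0, 1.0, -1.0])  # two tied top tokens
print(top_p_mask_with_ties(logits, p=0.5))    # both tied tokens remain in the top-p set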
@@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig):
     device = get_dp_group().device
     group = get_dp_group().device_group

-    # Transfering this tensor from GPU to CPU will introduce a GPU sync
+    # Transferring this tensor from GPU to CPU will introduce a GPU sync
     # point that could adversely affect performance of vllm with asynch
     # scheduling. This environment variable exists to quickly disable
     # this optimization if we run into this case.
     if parallel_config.disable_nccl_for_dp_synchronization:
-        logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.")
+        logger.info_once(
+            "Using CPU all reduce to synchronize DP padding between ranks."
+        )
         device = "cpu"
         group = get_dp_group().cpu_group
     return device, group
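For context on why the CPU group matters here: the log message says the returned device/group pair is used to all-reduce DP padding between ranks, and doing that reduction on a CPU group keeps the result on the host, avoiding the device-to-host copy (and implicit GPU sync) that an NCCL all-reduce would require. The sketch below illustrates the trade-off with plain torch.distributed; the function name, reduction op, and arguments are assumptions, not vLLM's actual code.

# Hedged sketch of the two synchronization paths; assumes an initialized
# torch.distributed process group. Names and the MAX reduction are illustrative.
import torch
import torch.distributed as dist


def sync_dp_padding(num_pad_tokens: int, use_cpu_group: bool, cpu_group, gpu_group, device) -> int:
    if use_cpu_group:
        # Tensor already lives on the host; no GPU sync point is introduced.
        t = torch.tensor([num_pad_tokens], dtype=torch.int64, device="cpu")
        dist.all_reduce(t, op=dist.ReduceOp.MAX, group=cpu_group)
    else:
        # NCCL path: the trailing .cpu() copy forces the GPU to synchronize,
        # which can hurt asynchronous scheduling.
        t = torch.tensor([num_pad_tokens], dtype=torch.int64, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.MAX, group=gpu_group)
        t = t.cpu()
    return int(t.item())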