From 66d3d5422c9b90f1ee9593e1793e86f14e4eb3f4 Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Thu, 27 Nov 2025 16:15:50 +0100
Subject: [PATCH] [Doc]: fixing typos in diverse files (#29492)

Signed-off-by: Didier Durand
---
 vllm/benchmarks/serve.py                | 4 ++--
 vllm/config/parallel.py                 | 4 ++--
 vllm/lora/punica_wrapper/punica_base.py | 2 +-
 vllm/model_executor/models/adapters.py  | 4 ++--
 vllm/v1/sample/tpu/sampler.py           | 2 +-
 vllm/v1/worker/dp_utils.py              | 6 ++++--
 6 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index dddb050ec180..519303c0bfa0 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
         "for headers to be passed with each request. These headers override "
         "per backend constants and values set via environment variable, and "
-        "will be overriden by other arguments (such as request ids).",
+        "will be overridden by other arguments (such as request ids).",
     )
     parser.add_argument(
         "--max-concurrency",
@@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--percentile-metrics",
         type=str,
         default=None,
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'If not specified, defaults to "ttft,tpot,itl" for generative models '
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 7ba1da5db384..4a8c8bc17cfc 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -238,9 +238,9 @@ class ParallelConfig:
     cp_kv_cache_interleave_size: int = 1
     """Interleave size of kv_cache storage while using DCP or PCP.
    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
-    and `total_cp_world_size = pcp_world_size * dcp_world_szie`.
+    and `total_cp_world_size = pcp_world_size * dcp_world_size`.
     store interleave_size tokens on total_cp_rank i,
-    then store next interleave_size tokens on taotal_cp_rank i+1.
+    then store next interleave_size tokens on total_cp_rank i+1.
     Interleave_size=1: token-level alignment, where token `i` is stored on
     total_cp_rank `i % total_cp_world_size`.
     Interleave_size=block_size: block-level alignment, where tokens are
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index ce38751e4b6a..47c42b095534 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         vocab_size: int,
     ):
         # NOTE We have remove lora extra vocab support for now. So we set
-        # extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed.
+        # extra_vocab_size always to 0, and extra_vocab_size will be removed.
         extra_vocab_size = 0

         (
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index a9cc49451a1d..5aba46f8614b 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax(
     )
     if text_config.tie_word_embeddings:
         # embed_tokens is the assumed name for input embeddings. If the model does not
-        # have this attribute, we fallback to get_input_embeddings(), which is used by
+        # have this attribute, we fall back to get_input_embeddings(), which is used by
         # the Transformers modeling backend.
         embed_tokens = (
             model.model.embed_tokens
@@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     )
     if text_config.tie_word_embeddings:
         # embed_tokens is the assumed name for input embeddings. If the model does not
-        # have this attribute, we fallback to get_input_embeddings(), which is used by
+        # have this attribute, we fall back to get_input_embeddings(), which is used by
         # the Transformers modeling backend.
         embed_tokens = (
             model.model.embed_tokens
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index 8f0463c76ce1..6d992bb37a59 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -181,7 +181,7 @@ def apply_top_k_top_p(
     after thresholding the logit using this cut-off, the remaining elements
     shall constitute the top-p set.

-    Note: in the case of tie (i.e. multipple cut-off elements present in the
+    Note: in the case of tie (i.e. multiple cut-off elements present in the
     logit), all tie elements are included in the top-p set. In other words,
     this function does not break ties. Instead, these tie tokens have equal
     chance of being chosen during final sampling, so we can consider the tie
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 064f2f0360cb..c1509de821b0 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig):
     device = get_dp_group().device
     group = get_dp_group().device_group

-    # Transfering this tensor from GPU to CPU will introduce a GPU sync
+    # Transferring this tensor from GPU to CPU will introduce a GPU sync
     # point that could adversely affect performance of vllm with asynch
     # scheduling. This environment variable exists to quickly disable
     # this optimization if we run into this case.
     if parallel_config.disable_nccl_for_dp_synchronization:
-        logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.")
+        logger.info_once(
+            "Using CPU all reduce to synchronize DP padding between ranks."
+        )
         device = "cpu"
         group = get_dp_group().cpu_group
     return device, group
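
As a companion to the `cp_kv_cache_interleave_size` docstring corrected in vllm/config/parallel.py above, the following is a minimal illustrative sketch of the token-to-rank mapping that docstring describes. It is plain Python, not vLLM's implementation, and the helper name `total_cp_rank_of_token` is made up for this note:

def total_cp_rank_of_token(
    token_idx: int,
    interleave_size: int,
    pcp_world_size: int,
    dcp_world_size: int,
) -> int:
    """Return the total CP rank whose KV cache stores `token_idx` (sketch only)."""
    total_cp_world_size = pcp_world_size * dcp_world_size
    # Consecutive chunks of `interleave_size` tokens rotate round-robin across ranks.
    return (token_idx // interleave_size) % total_cp_world_size

# interleave_size=1 is token-level alignment: token i lands on rank i % total_cp_world_size.
assert total_cp_rank_of_token(5, 1, 2, 2) == 1
# interleave_size=block_size keeps whole blocks on one rank: tokens 0..15 stay on rank 0.
assert total_cp_rank_of_token(5, 16, 2, 2) == 0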
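
Similarly, the tie-handling note corrected in vllm/v1/sample/tpu/sampler.py can be illustrated with a small tie-inclusive top-p mask in plain PyTorch. This is a sketch assuming 1-D logits, not the TPU kernel itself, and `top_p_mask_keep_ties` is a hypothetical helper written only for this note:

import torch

def top_p_mask_keep_ties(logits: torch.Tensor, p: float) -> torch.Tensor:
    """Boolean mask of tokens kept by a tie-inclusive top-p cut-off (sketch only)."""
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, _ = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # The smallest prefix whose cumulative probability reaches p defines the cut-off value.
    cutoff_idx = int(torch.searchsorted(cumulative, torch.tensor(p)).clamp(max=probs.numel() - 1))
    cutoff = sorted_probs[cutoff_idx]
    # ">=" keeps every token tied with the cut-off value, i.e. ties are not broken.
    return probs >= cutoff

# Two tokens tie at the cut-off probability; both are kept in the top-p set.
mask = top_p_mask_keep_ties(torch.tensor([2.0, 2.0, 0.0, -3.0]), p=0.5)
print(mask)  # tensor([ True,  True, False, False])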