From d3da2eea546b33b9444519f99c26721f7344117f Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:37:38 +0200
Subject: [PATCH] [Doc]: fix typos in Python scripts (#23828)

Signed-off-by: Didier Durand
---
 vllm/compilation/backends.py | 4 ++--
 vllm/config/cache.py | 2 +-
 vllm/engine/arg_utils.py | 2 +-
 vllm/entrypoints/chat_utils.py | 2 +-
 vllm/entrypoints/openai/api_server.py | 2 +-
 .../quantization/compressed_tensors/compressed_tensors.py | 2 +-
 .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +-
 vllm/v1/cudagraph_dispatcher.py | 4 ++--
 vllm/v1/worker/block_table.py | 2 +-
 vllm/v1/worker/cpu_model_runner.py | 2 +-
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index fa86773d2474..3361b65a9b88 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
         outputs.append(
             SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
 
-    # sort by intetger graph_id, rather than string name
+    # sort by integer graph_id, rather than string name
     outputs.sort(key=lambda x: x.graph_id)
 
     return split_gm, outputs
@@ -424,7 +424,7 @@ class VllmBackend:
 
         # if the model is initialized with a non-empty prefix,
         # then usually it's enough to use that prefix,
-        # e.g. launguage_model, vision_model, etc.
+        # e.g. language_model, vision_model, etc.
         # when multiple parts are initialized as independent
         # models, we need to use the model_tag to distinguish
         # them, e.g. backbone (default), eagle_head, etc.
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index a9550d4390ad..3d2aa6b17be7 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -115,7 +115,7 @@ class CacheConfig:
 
     In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
     some layers can skip tokens corresponding to prefill. This flag enables
-    attention metadata for eligible layers to be overriden with metadata
+    attention metadata for eligible layers to be overridden with metadata
     necessary for implementing this optimization in some models (e.g.
     Gemma3n)
     """
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e4d205aeb863..7802802f138b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1053,7 +1053,7 @@ class EngineArgs:
             self.trust_remote_code, self.revision, self.code_revision,
             self.config_format)
 
-        # if loading a SpeculatorsConfig, load the specualtive_config
+        # if loading a SpeculatorsConfig, load the speculative_config
         # details from the config directly
         # no user input required / expected
         if isinstance(hf_config, SpeculatorsConfig):
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 7b11a50642de..1954cbcbf1ed 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
     def __init__(self) -> None:
         super().__init__()
 
-        # stores model placehodlers list with corresponding
+        # stores model placeholders list with corresponding
         # general MM placeholder:
         # {
         #   "<##IMAGE##>": ["", "", ""],
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9a2470649c8d..a28d38729f9f 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
             raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                                 detail="Missing 'method' in request body")
         # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deseralization if needed.
+        # User-defined `method` is responsible for deserialization if needed.
         args: list[str] = body.get("args", [])
         kwargs: dict[str, str] = body.get("kwargs", {})
         timeout: Optional[float] = body.get("timeout")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 230572041c80..b07bf675ca47 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             format = scheme_dict.get("format")
 
         # Find the sparsity scheme of the layer
-        # assume that fused layers inerhit first component's sparsity scheme
+        # assume that fused layers inherit first component's sparsity scheme
         sparsity_targets = (self.sparsity_scheme_map.keys() -
                             set(self.sparsity_ignore_list))
         sparsity_scheme: Optional[SparsityCompressionConfig] = None
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index af9d1c46f68f..2cad9ff0d321 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
     ) -> "CompressedTensorsMoEMethod":
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
-        # Check if a using "Linear" to select scheems
+        # Check if a using "Linear" to select schemes
         if "Linear" in quant_config.target_scheme_map:
             matched_target = "Linear"
         else:
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 02e65820b7c0..3b4f1d20b64f 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -11,7 +11,7 @@ logger = init_logger(__name__)
 
 class CudagraphDispatcher:
     """
-    Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
+    Runtime cudagraph dispatcher to dispatch keys for multiple set of cudagraphs.
 
     The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
     for FULL cudagraph runtime mode. The keys are initialized depending on
@@ -21,7 +21,7 @@ class CudagraphDispatcher:
 
     At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
     PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
-    based on the input key. After dispatching (commuicate via forward context),
+    based on the input key. After dispatching (communicate via forward context),
     the cudagraph wrappers will trust the dispatch key to do either capturing
     or replaying (if mode matched), or pass through to the underlying runnable
     without cudagraph (if mode no match or mode is NONE).
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 5662fc350e19..6ab5ce2748a4 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -110,7 +110,7 @@ class BlockTable:
         self.block_table_cpu.fill_(0)
 
     def get_device_tensor(self) -> torch.Tensor:
-        """Ruturns the device tensor of the block table."""
+        """Returns the device tensor of the block table."""
        return self.block_table
 
     def get_cpu_tensor(self) -> torch.Tensor:
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 742e553b77e0..7d0726112704 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
         Args:
             scheduler_output: The scheduler output.
         """
-        # Attention free models have zero kv_cache_goups, however models
+        # Attention free models have zero kv_cache_groups, however models
         # like Mamba are also attention free but use the kv_cache for
         # keeping its internal state. This is why we check the number
         # of kv_cache groups instead of solely checking