[Doc]: fix typos in Python comments (#24173)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent e41a0fa377
commit 83609ca91d
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
 # [6880, 6881] -> ['Ġcalls', 'here'] ->
 # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
 # To avoid uncontrolled change of the prompt length,
-# the encoded sequence is truncated before being decode again.
+# the encoded sequence is truncated before being decoded again.
 total_input_len = prefix_len + int(input_lens[i])
 re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
     :total_input_len
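The fixed comment describes a re-tokenization subtlety: decoding token IDs and re-encoding the resulting text can merge or split tokens (as the ['Ġcalls', 'here'] vs ['Ġcall', 'sh', 'ere'] example shows), so the benchmark clips the re-encoded sequence back to the target length before decoding again. A minimal sketch of that pattern, assuming a Hugging Face tokenizer (illustrative, not the vLLM benchmark code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
target_len = 8

# Decode a synthetic token-ID prompt, then re-encode it; the round trip may
# change the number of tokens, so clip before decoding again.
ids = tokenizer.encode("calls here " * 8, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(ids)
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(re_encoded)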
@@ -637,7 +637,7 @@ def bench_optype(
 # Clear LoRA optimization hash-maps.
 _LORA_A_PTR_DICT.clear()
 _LORA_B_PTR_DICT.clear()
-# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
 for kwargs in kwargs_list:
     op_type.bench_fn()(**kwargs)
 torch.cuda.synchronize()
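The comments in this hunk describe a common CUDA benchmarking setup: clear lazily built lookup tables, run the op once so they are repopulated, and synchronize before timing. A generic sketch of that warm-up pattern (the fn/kwargs_list/caches arguments are hypothetical, not the vLLM LoRA benchmark itself):

import torch

def warmup(fn, kwargs_list, caches):
    # Clear any lazily built caches so the timed runs start from a known state.
    for cache in caches:
        cache.clear()
    # Run once so the caches are set up again before measurement.
    for kwargs in kwargs_list:
        fn(**kwargs)
    # Wait for all queued CUDA work to finish before timing starts.
    torch.cuda.synchronize()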
@@ -962,7 +962,7 @@ async def main_mp(

 # At this point all the clients finished,
 # collect results (TTFT, TPOT, etc.) from all the clients.
-# This needs to happens before calling join on the clients
+# This needs to happen before calling join on the clients
 # (result_queue should be emptied).
 while not result_queue.empty():
     client_metrics.append(result_queue.get())
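The corrected comment reflects a real multiprocessing constraint: a child process that has put items on a multiprocessing.Queue may not terminate until those items are consumed, so the queue must be drained before join(). A small sketch of that order of operations (illustrative names):

import multiprocessing as mp

def collect_metrics(result_queue, clients):
    metrics = []
    # Drain the queue first (result_queue should be emptied) ...
    while not result_queue.empty():
        metrics.append(result_queue.get())
    # ... and only then join the client processes.
    for client in clients:
        client.join()
    return metrics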
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:

 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
-    # NOTE - the setting in this example are somehat different than what is
+    # NOTE - the setting in this example are somewhat different from what is
     # optimal for granite speech, and it is generally recommended to use beam
     # search. Check the model README for suggested settings.
     # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(

 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                              size_type: SizeType):
-    """Applies a size scaler to one image; this can be a an image size factor,
+    """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
     # are considering size factors at constant scale, i.e., we just clone
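The docstring being fixed concerns scaling an image by a size factor while keeping its aspect ratio. A tiny sketch of that operation with PIL (illustrative; the test helper also handles fixed target sizes and tensor embeddings, which are omitted here):

from PIL import Image

def scale_by_factor(image: Image.Image, factor: float) -> Image.Image:
    width, height = image.size
    # Scaling both dimensions by the same factor preserves the aspect ratio.
    return image.resize((int(width * factor), int(height * factor)))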
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
 else:
     assert test_info.prompt_formatter is not None

-# Everything looks okay; keep if this is has correct proc handling
+# Everything looks okay; keep if this is correct proc handling
 if (test_info.distributed_executor_backend
         is not None) == new_proc_per_test:
     matching_tests[test_name] = test_info
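The condition in this hunk keeps a test only when its need for a dedicated process matches the collection mode: setting distributed_executor_backend implies a new process per test. A compact sketch of that filter (illustrative, not the actual helper):

def filter_by_proc_handling(tests: dict, new_proc_per_test: bool) -> dict:
    return {
        name: info
        for name, info in tests.items()
        # Keep the test if "needs its own process" matches the requested mode.
        if (info.distributed_executor_backend is not None) == new_proc_per_test
    }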
@@ -822,7 +822,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
     and context_lens_tensor is not None \
     and context_lens_tensor[:self.num_prefills].max() > 0:

-    # NOTE: it is recommend you read the `Chunked Prefill` section in
+    # NOTE: it is recommended you read the `Chunked Prefill` section in
     # the comment at the top of the file before trying to understand
     # the following code

@@ -717,7 +717,7 @@ class AsyncLLMEngine(EngineClient):
 # Stop the execute model loop in parallel workers until there
 # are more requests to process. This avoids waiting
 # indefinitely in torch.distributed ops which may otherwise
-# timeout, and unblocks the RPC thread in the workers so that
+# time out, and unblocks the RPC thread in the workers so that
 # they can process any other queued control plane messages,
 # such as add/remove lora adapters.
 await engine.engine.stop_remote_worker_execution_loop_async()
@@ -270,7 +270,7 @@ class MQLLMEngineClient(EngineClient):
         queue.put_nowait(request_output)

 async def setup(self):
-    """Setup the client before it starts sending server requests."""
+    """Set up the client before it starts sending server requests."""

     # Start output_loop
     if self.output_loop is None:
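setup() here lazily starts a background task the first time it runs. A minimal asyncio sketch of that start-once pattern (hypothetical handler name, not MQLLMEngineClient's real output loop):

import asyncio
from typing import Optional

class Client:
    def __init__(self):
        self.output_loop: Optional[asyncio.Task] = None

    async def _handle_outputs(self):
        while True:
            await asyncio.sleep(1)  # placeholder for reading server responses

    async def setup(self):
        """Set up the client before it starts sending server requests."""
        # Start the output handler only once.
        if self.output_loop is None:
            self.output_loop = asyncio.create_task(self._handle_outputs())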
@@ -19,7 +19,7 @@ def awq_dequantize_kernel(
         num_rows, # input num rows in qweight
         BLOCK_SIZE_X: tl.constexpr,
         BLOCK_SIZE_Y: tl.constexpr):
-    # Setup the pids.
+    # Set up the pids.
     pid_x = tl.program_id(axis=0)
     pid_y = tl.program_id(axis=1)

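For readers unfamiliar with the kernel context: tl.program_id returns the block index along one axis of the launch grid, and the two pids select the tile a program instance works on. A generic Triton sketch of that 2-D setup (not the AWQ dequantize kernel itself; names and the fill operation are illustrative):

import triton
import triton.language as tl

@triton.jit
def fill_kernel(out_ptr, n_rows, n_cols,
                BLOCK_SIZE_X: tl.constexpr, BLOCK_SIZE_Y: tl.constexpr):
    # Set up the pids: one program per (BLOCK_SIZE_Y, BLOCK_SIZE_X) tile.
    pid_x = tl.program_id(axis=0)
    pid_y = tl.program_id(axis=1)
    offs_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
    offs_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
    # Mask out-of-range elements so partial edge tiles stay in bounds.
    mask = (offs_y[:, None] < n_rows) & (offs_x[None, :] < n_cols)
    idx = offs_y[:, None] * n_cols + offs_x[None, :]
    tl.store(out_ptr + idx, idx.to(tl.float32), mask=mask)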
@@ -128,7 +128,7 @@ class QuantizationConfig(ABC):
 @staticmethod
 def get_from_keys_or(config: dict[str, Any], keys: list[str],
                      default: Any) -> Any:
-    """Get a optional value from the model's quantization config."""
+    """Get an optional value from the model's quantization config."""
     try:
         return QuantizationConfig.get_from_keys(config, keys)
     except ValueError:
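The docstring fix sits on a simple fallback helper: as the except ValueError above implies, get_from_keys raises when none of the candidate keys is present, and get_from_keys_or converts that into a default. A standalone sketch of the same lookup-with-default pattern (mirroring the shape of the code above, not a verbatim copy):

from typing import Any

def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
    """Return the value of the first matching key, or raise ValueError."""
    for key in keys:
        if key in config:
            return config[key]
    raise ValueError(f"None of the keys {keys} found in the config")

def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any:
    """Get an optional value from the model's quantization config."""
    try:
        return get_from_keys(config, keys)
    except ValueError:
        return default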
@@ -401,7 +401,7 @@ M = TypeVar("M", bound=MLACommonMetadata)


 def use_flashinfer_prefill() -> bool:
-    # For blackwell default to flashinfer prefill if its available since
+    # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
     return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
             and current_platform.is_device_capability(100))
@@ -1018,7 +1018,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
     return layer.weight

 # we currently do not have quantized bmm's which are needed for
-# `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+# `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
 # the bmm's in 16-bit, the extra memory overhead of this is fairly low
 kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
 assert kv_b_proj_weight.shape == (
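The fixed comment explains why the MLA path keeps 16-bit copies: there is no quantized bmm, so the kv_b_proj weight is dequantized once and its two factors are stored in fp16/bf16 for use with torch.bmm. A rough sketch of splitting such a weight into per-head factors (shapes and names are illustrative, not the exact vLLM layout):

import torch

num_heads, qk_nope_dim, v_dim, kv_lora_rank = 8, 64, 64, 128

# Pretend this came from a dequantized kv_b_proj weight (transposed), so its
# shape is (kv_lora_rank, num_heads * (qk_nope_dim + v_dim)).
kv_b_proj_weight = torch.randn(kv_lora_rank, num_heads * (qk_nope_dim + v_dim))

w = kv_b_proj_weight.view(kv_lora_rank, num_heads, qk_nope_dim + v_dim)
W_UK, W_UV = w.split([qk_nope_dim, v_dim], dim=-1)

# Store bf16 copies laid out for torch.bmm: one batch entry per head.
W_UK_T = W_UK.permute(1, 2, 0).contiguous().to(torch.bfloat16)  # (heads, qk, rank)
W_UV = W_UV.permute(1, 0, 2).contiguous().to(torch.bfloat16)    # (heads, rank, v)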