From 3bdd426636cec97d4cd5cff0e1a057b45429e07c Mon Sep 17 00:00:00 2001
From: Wilson Wu
Date: Wed, 10 Dec 2025 12:05:28 +0800
Subject: [PATCH] Fix typos in comments across multiple files (#30345)

Signed-off-by: Wilson Wu
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
---
 csrc/cpu/cpu_attn_impl.hpp                     | 2 +-
 csrc/quantization/machete/machete_mainloop.cuh | 2 +-
 docs/features/nixl_connector_usage.md          | 2 +-
 vllm/model_executor/layers/fused_moe/layer.py  | 2 +-
 .../schemes/compressed_tensors_w4a16_nvfp4.py  | 2 +-
 vllm/v1/worker/gpu_model_runner.py             | 2 +-
 vllm/v1/worker/utils.py                        | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index 02164ed3666e3..e3e077b845f4f 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -186,7 +186,7 @@ struct AttentionMetadata {
 // - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 // * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
-// - flags: bool array to indicate wether the split is finished
+// - flags: bool array to indicate whether the split is finished
 // - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 // - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh
index 2f52a6b7a0246..9f02f4f179741 100644
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
 
   // Same as upstream, should be kept the same when possible, not formatted for
   // easier comparison
-  // with `SwapAB ? N : M -> M` since we dont support SwapAB
+  // with `SwapAB ? N : M -> M` since we don't support SwapAB
   // clang-format off
   template
   static bool
diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index 84c8f9e77d6d3..601205e1ed0b1 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
 NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
 
 ```bash
-# Example UCX configuration, adjust according to your enviroment
+# Example UCX configuration, adjust according to your environment
 export UCX_TLS=all          # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
 export UCX_NET_DEVICES=all  # or specify network devices like "mlx5_0:1,mlx5_1:1"
 ```
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index e635382068a63..61dd1892d67ea 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
             # Record that the clone will be used by shared_experts_stream
             # to avoid gc issue from deallocation of hidden_states_clone
             # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
-            # NOTE: We dont need shared_output.record_stream(current_stream())
+            # NOTE: We don't need shared_output.record_stream(current_stream())
             # because we synch the streams before using shared_output.
             hidden_states_clone.record_stream(self.shared_experts_stream)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index 3afadc6eb7e5b..d2701a464f129 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        # dont restrict as emulations
+        # don't restrict as emulations
        return 80
 
     def create_weights(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f6f89d6eb6736..39456d2e80ed0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4871,7 +4871,7 @@ class GPUModelRunner(
         # we need to adjust the cudagraph sizes to be a multiple of the uniform
         # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
         # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
-        # Will be removed in the near future when we have seperate cudagraph capture
+        # Will be removed in the near future when we have separate cudagraph capture
         # sizes for decode and mixed prefill-decode.
         if (
             cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 0b0e2006d73d2..4dd9463ee6285 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -135,7 +135,7 @@ class AttentionGroup:
     kv_cache_spec: KVCacheSpec
     kv_cache_group_id: int
     # When ubatching is enabled we will have a metadata builder for each ubatch
-    # so that if they use internal persistant buffers for cudagraphs, and they
+    # so that if they use internal persistent buffers for cudagraphs, and they
     # won't have to worry about conflicting with the other ubatches.
     metadata_builders: list[AttentionMetadataBuilder] = field(
         default_factory=lambda: []
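
The fused_moe hunk above touches a comment about PyTorch's cross-stream memory-safety pattern (`Tensor.record_stream`). The sketch below is illustrative only, not part of the patch and not vLLM code; `side_stream`, `x`, and `y` are hypothetical stand-ins for `shared_experts_stream`, `hidden_states_clone`, and `shared_output`.

```python
# Minimal sketch (assumed example) of the record_stream pattern referenced in
# the fused_moe comment: a tensor consumed on a side stream must be registered
# with that stream so the CUDA caching allocator does not recycle its memory
# while the side stream is still reading it.
import torch

if torch.cuda.is_available():
    side_stream = torch.cuda.Stream()     # stand-in for shared_experts_stream
    x = torch.randn(1024, device="cuda")  # allocated on the current stream

    # Make the side stream wait for the work that produced x, then use x there.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        y = x * 2.0

    # Without this call, freeing x on the current stream could let the allocator
    # hand its memory to a new tensor before the side stream finishes with it.
    x.record_stream(side_stream)

    # Synchronizing before consuming y is why no y.record_stream(...) is needed,
    # mirroring the "we synch the streams before using shared_output" note above.
    torch.cuda.current_stream().wait_stream(side_stream)
    print(y.sum().item())
```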