[Doc]: fix typos in Python comments (#24294)

Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
Didier Durand 2025-09-06 04:41:12 +02:00 committed by GitHub
parent 35efa70297
commit 35bf193864
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 17 additions and 17 deletions

View File

@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
))
def prepacked_type_key(prepack_type: PrepackTypeConfig):
# For now we we can just use the first accumulator type seen since
# For now, we can just use the first accumulator type seen since
# the tensor core shapes/layouts don't vary based on accumulator
# type so we can generate less code this way
return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)

View File

@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
- Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size`
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
### Which quantization configs does vLLM CPU support?

View File

@ -42,7 +42,7 @@ def run_test(
tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None,
):
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

View File

@ -60,7 +60,7 @@ class CustomAllreduce:
group: the process group to work on. If None, it will use the
default process group.
device: the device to bind the CustomAllreduce to. If None,
it will be bind to f"cuda:{local_rank}".
it will be bound to f"cuda:{local_rank}".
It is the caller's responsibility to make sure each communicator
is bound to a unique device, and all communicators in this group
are in the same node.
@ -158,7 +158,7 @@ class CustomAllreduce:
self.disabled = False
# Buffer memory is owned by this Python class and passed to C++.
# Meta data composes of two parts: meta data for synchronization and a
# Metadata consists of two parts: metadata for synchronization and a
# temporary buffer for storing intermediate allreduce results.
self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
group=group,

View File

@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
if request.tools and request.tool_choice != 'none':
# do not skip special tokens because internlm use the special
# tokens to indicated the start and end of the tool calls
# tokens to indicate the start and end of the tool calls
# information.
request.skip_special_tokens = False
return request
@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
if '<|action_start|>' not in current_text:
self.position = len(current_text)
return DeltaMessage(content=delta_text)
# if the tool call is sended, return a empty delta message
# to make sure the finish_reason will be send correctly.
# if the tool call is sent, return an empty delta message
# to make sure the finish_reason will be sent correctly.
if self.current_tool_id > 0:
return DeltaMessage(content='')

View File

@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm should use flashinfer fused allreduce. The variable should be a
# JSON with the following format:
# { <world size>: <max size in mb> }
# Unspecified world sizes will fallback to
# Unspecified world sizes will fall back to
# { 2: 64, 4: 1, <everything else>: 0.5 }
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
lambda: json.loads(os.getenv(

View File

@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
EM = sorted_token_ids.size(0)
if A.size(0) < config["BLOCK_SIZE_M"]:
# optimize for small batch_size.
# We assume that top_ids of each token is unique, so
# We assume that top_ids of each token is unique,
# so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
# and we can skip some invalid blocks.
EM = min(sorted_token_ids.size(0),

View File

@ -710,7 +710,7 @@ def determine_expert_map(
# Create a tensor of size num_experts filled with -1
expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
# Create a expert map for the local experts
# Create an expert map for the local experts
start_idx = ep_rank * base_experts + min(ep_rank, remainder)
expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
0, local_num_experts, dtype=torch.int32)
@ -806,7 +806,7 @@ class FusedMoE(CustomOp):
self.global_num_experts = num_experts + num_redundant_experts
# we padding globally so EP buffer allocation works
# we are padding globally so EP buffer allocation works
if quant_config and quant_config.get_name() == "mxfp4":
from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
should_use_flashinfer_mxfp4)

View File

@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
)
layer.register_parameter("w2_scales", w2_scales)
set_weight_attrs(w2_scales, extra_weight_attrs)
# dont shard the w2 scales when running act order
# don't shard the w2 scales when running act order
set_weight_attrs(w2_scales,
{"load_full_w2": self.quant_config.desc_act})
# up_proj scales
@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
)
layer.register_parameter("w2_qzeros", w2_qzeros)
set_weight_attrs(w2_qzeros, extra_weight_attrs)
# dont shard the w2 scales when running act order
# don't shard the w2 scales when running act order
set_weight_attrs(w2_qzeros,
{"load_full_w2": self.quant_config.desc_act})
w13_g_idx = torch.nn.Parameter(

View File

@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl):
else:
raise ValueError(f"Unsupported output dtype: {output.dtype}")
# TRTLLM attn kernel requires o scale to pass as a host scalar,
# TRTLLM attn kernel requires the o scale to be passed as a host scalar,
# store the o scale as a host scalar in warmup run with cuda graph
# not enabled
if layer._o_scale_float is None:

View File

@ -439,7 +439,7 @@ class EngineCore:
"""
# Note on thread safety: no race condition.
# `mm_receiver_cache` is reset at the end of LLMEngine init,
# and will only accessed in the input processing thread afterwards.
# and will only be accessed in the input processing thread afterwards.
if self.mm_receiver_cache is not None and request.mm_features:
request.mm_features = (
self.mm_receiver_cache.get_and_update_features(

View File

@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Disable cudagraph capturing globally, so any unexpected cudagraph
# capturing will be detected and raise an error after here.
# Note: We don't put it into graph_capture context manager because
# we may doing lazy capturing in future that still allows capturing
# we may do lazy capturing in the future that still allows capturing
# after here.
set_cudagraph_capturing_enabled(False)